Feature generation
Generated new features using primarily pressure data from the kiln logs. These new features, along with weather data, were then merged with both lot yields and item yields to determine if any of these factors influence yields in any way.
Lot yields
With lot yields, LASSO regression was used to eliminate the least important variables on a per kiln basis. When testing for variable importance on each kiln, none of the generated features were determined to be beneficial to the model. In other words, simply using the mean value of the lot yield produced better results than including any of the generated features.
Item yields
Some of the top occurring items were also tested. Unfortunately, similar results were observed where most features end up being eliminated. In some cases, the variables show very minor importance, which upon further investigation seems to be due to outliers in the dataset.
Next?
Its appears that none of the features, when singled out, play a significant linear role in determination of yields, whether it be yields of entire kilns, or of specific items within the kilns.
When looking at the Feature plots below, we ideally want to see a horizontal separation of the colored Yields points which would indicate that a change in that feature is responsible for a change in the yield value. Unfortunately, most of the below models indicate that a horizontal line (the mean yield value) has more predictive power than using any of the currently generated features.
We may need to look at additional features. It’s also possible there are more complex interactions between the features not accounted for in these models.
Using the kiln pressure data we can generate features and determine if they have an effect on yields. Some of the features discussed include:
Durations between (0.01 <= pressure <= 0.03) were split into: (0.01 <= pressure < 0.02) and (0.02 <= pressure < 0.03).
press_calcs <- kilns_All %>%
group_by(LOTNO) %>%
mutate(
# adust pressure max and min
pressure = case_when(pressure > .07 ~ .07,
pressure < -.07 ~ -.07,
TRUE ~ pressure
),
# find time index where temp reaches 1200F
close_1200 = if_else( (time < index_max_temp), (abs(1200 - avg_kiln_temp)), NULL ))
# join time index to original
press_calcs <- left_join(press_calcs, press_calcs %>%
group_by(LOTNO) %>%
dplyr::summarise(index_1200F = which.min(close_1200))
)
press_calcs <- press_calcs %>%
group_by(LOTNO) %>%
mutate(
# segregate pressures into positive and negative columns
press_pos = if_else((pressure >= 0) & (time <= index_1200F), pressure, NULL),
press_neg = if_else((pressure < 0) & (time <= index_1200F), pressure, NULL),
# get all pressures below 1200F point for easy mean, sd calcs
press_1200F = if_else(time <= index_1200F, pressure, NULL),
# segregate pressures into range columns
press_greater_03 = if_else((pressure >= .03) & (time <= index_1200F), pressure, NULL),
press_betw_02_03 = if_else((pressure >= .02) & (pressure < .03) & (time <= index_1200F), pressure, NULL),
press_betw_01_02 = if_else((pressure >= .01) & (pressure < .02) & (time <= index_1200F), pressure, NULL),
press_betw_00_01 = if_else((pressure >= .0) & (pressure < .01) & (time <= index_1200F), pressure, NULL),
press_less_00 = if_else((pressure < 0) & (time <= index_1200F), pressure, NULL),
# add counter columns for easy aggregation of total times spent in range
press_pos_count = if_else(!is.na(press_pos),1,0),
press_neg_count = if_else(!is.na(press_neg),1,0),
press_greater_03_count = if_else(!is.na(press_greater_03),1,0),
press_betw_02_03_count = if_else(!is.na(press_betw_02_03),1,0),
press_betw_01_02_count = if_else(!is.na(press_betw_01_02),1,0),
press_betw_00_01_count = if_else(!is.na(press_betw_00_01),1,0),
press_less_00_count = if_else(!is.na(press_less_00),1,0)
) %>%
# remove helper columns
dplyr::select(-c(close_1200)) %>%
dplyr::select(c(LOTNO,index_max_temp,index_1200F,everything()))
# summarise times at pressures
press_summ <- press_calcs %>%
group_by(LOTNO) %>%
dplyr::summarise(
# sum times at negative and positive pressure
time_at_pos = sum(press_pos_count, na.rm=TRUE),
time_at_neg = sum(press_neg_count, na.rm=TRUE),
# get means and sd of pressures
mean_press = mean(press_1200F, na.rm=TRUE),
sd_press = sd(press_1200F, na.rm=TRUE),
# sum times at pressure ranges
time_greater_03 = sum(press_greater_03_count, na.rm=TRUE),
time_betw_02_03 = sum(press_betw_02_03_count, na.rm=TRUE),
time_betw_01_02 = sum(press_betw_01_02_count, na.rm=TRUE),
time_betw_00_01 = sum(press_betw_00_01_count, na.rm=TRUE),
time_less_00 = sum(press_less_00_count, na.rm=TRUE),
# pos and neg AUC pressures
AUC_pos_press = sum(press_pos, na.rm=TRUE),
AUC_neg_press = sum(abs(press_neg), na.rm=TRUE)
)
# join summarised features to original pressure data for easy plotting and verification
press_joined <- left_join(press_calcs, press_summ) %>%
mutate(
pct_time_at_pos = time_at_pos/index_1200F,
pct_time_at_neg = time_at_neg/index_1200F,
pct_time_greater_03 = time_greater_03/index_1200F,
pct_time_betw_02_03 = time_betw_02_03/index_1200F,
pct_time_betw_01_02 = time_betw_01_02/index_1200F,
pct_time_betw_00_01 = time_betw_00_01/index_1200F,
pct_time_less_00 = time_less_00/index_1200F
) %>%
# remove helper columns
dplyr::select(-c(press_pos_count,
press_neg_count,
press_greater_03_count,
press_betw_02_03_count,
press_betw_01_02_count,
press_betw_00_01_count,
press_less_00_count,
press_1200F
))
# attached aucDiff to each df
press_summ <- left_join(press_summ, allAucValues)
press_joined <- left_join(press_joined, allAucValues)
# add kiln column for quick filtering
press_summ <- press_summ %>%
mutate(kiln = as.factor(str_extract(LOTNO, "[:alpha:]")))
# add weather data as well..
press_summ <- df_yields %>%
group_by(LOTNO) %>% slice(1) %>% ungroup() %>%
dplyr::select(LOTNO, precip:temp_avg) %>%
right_join(press_summ)
# prepare lot yield metric
# get lot yields by summarising fired and rejects
lot_yields <-
df_yields %>%
group_by(LOTNO) %>%
dplyr::summarise(
lot_items_fired = sum(TOTAL_ITEM_FIRED),
lot_items_rejected = sum(TOTAL_ITEM_REJECTED),
lot_yield = (lot_items_fired - lot_items_rejected) / lot_items_fired
) %>%
dplyr::select(LOTNO, lot_yield)
# join yields with features, remove 20 missing lot yield values
press_summ <- left_join(press_summ, lot_yields) %>%
na.omit()
# remove zero value lot yields as outliers
press_summ <- press_summ %>%
mutate(lot_yield = ifelse( (lot_yield < .5), NA, lot_yield)) %>%
na.omit()
Quick visualization of randomly selected lots to verify that metrics seem to be produced correctly.
# sample random lots
set.seed(123)
sampleC <- sample(levels(kilns_C$LOTNO),1)
sampleD <- sample(levels(kilns_D$LOTNO),1)
sampleE <- sample(levels(kilns_E$LOTNO),1)
sampleF <- sample(levels(kilns_F$LOTNO),1)
sampleG <- sample(levels(kilns_G$LOTNO),2)
sampleH <- sample(levels(kilns_H$LOTNO),2)
sample <- c(sampleC, sampleD, sampleE, sampleF)
# scale and shift values for second y-axis
sec_y_scale=20000
sec_y_shift=1500
# plot
press_joined %>%
dplyr::filter(LOTNO %in% sample) %>%
# KILN TEMP, SETPOINT, PRESSURE
ggplot(aes(x=time))+
geom_line(aes(y=avg_kiln_temp))+
geom_line(aes(y=setpoint),color='grey50',linetype='dashed')+
geom_point(aes(y=pressure * sec_y_scale + sec_y_shift),alpha=.1,size=.1)+
# PRESSURE POINTS
geom_point(aes(y=press_betw_02_03 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='green3')+
geom_point(aes(y=press_betw_01_02 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='green')+
geom_point(aes(y=press_greater_03 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='red')+
geom_point(aes(y=press_betw_00_01 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='yellow2')+
geom_point(aes(y=press_less_00 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='red')+
# HLINES
geom_hline(aes(yintercept = .0 * sec_y_scale + sec_y_shift),color='red')+
geom_hline(aes(yintercept = .01 * sec_y_scale + sec_y_shift),color='red',linetype='dashed')+
geom_hline(aes(yintercept = .02 * sec_y_scale + sec_y_shift),color='green',linetype='dashed')+
geom_hline(aes(yintercept = .03 * sec_y_scale + sec_y_shift),color='red',linetype='dashed')+
# mean/sd
# geom_hline(aes(yintercept = mean_press * sec_y_scale + sec_y_shift), color = 'blue',linetype='dotdash',size=1)+
# geom_hline(aes(yintercept = (mean_press+sd_press) * sec_y_scale + sec_y_shift), color = 'blue2',linetype='dotted',size=1)+
# geom_hline(aes(yintercept = (mean_press-sd_press) * sec_y_scale + sec_y_shift), color = 'blue2',linetype='dotted',size=1)+
# PRESSURE RIBBONS
# geom_ribbon(aes(ymin = press_less_00 * sec_y_scale + sec_y_shift, ymax = 0 * sec_y_scale + sec_y_shift),fill='red',alpha=1)+
# geom_ribbon(aes(ymin = 0.00 * sec_y_scale + sec_y_shift, ymax = press_betw_00_01 * sec_y_scale +sec_y_shift),fill='yellow3',alpha=1)+
# geom_ribbon(aes(ymin = 0.0 * sec_y_scale + sec_y_shift, ymax = press_betw_01_03 * sec_y_scale + sec_y_shift),fill='green',alpha=1)+
# geom_ribbon(aes(ymin = 0.0 * sec_y_scale + sec_y_shift, ymax = press_greater_03 * sec_y_scale + sec_y_shift),fill='red',alpha=1)+
# zoom on ribbons
# scale_x_continuous(limits = c(0,18.4*60))+
# scale_y_continuous(limits = c(1250,2350))
# setpoint, temp AUC ribbon
geom_ribbon(aes(ymin = auc_min, ymax = auc_max),fill='pink',alpha=1)+
scale_y_continuous(name = "Kiln temp (F)",
breaks = sort(c(seq(0,3000,1000),400,600,1200)),
sec.axis = sec_axis(~. / sec_y_scale - (sec_y_shift / sec_y_scale),
name = "Pressure",
breaks = seq(-.1,.1,.01)
)
)+
scale_x_continuous(name = "Hours",
labels = number_format(scale=1/60),
breaks = sort(c(seq(0,100*60,12*60)))
)+
# labels
# AUC
geom_label(data = press_joined %>% dplyr::filter(LOTNO %in% sample) %>% distinct(aucDiff),
aes(x = 60 * 30, y = -0.065 * sec_y_scale + sec_y_shift,
label = paste0( "AUC: ", round(aucDiff,0) )
),
label.padding = unit(0.2, "lines"),hjust=1,vjust=.5,fill='pink',label.size=.1
) +
# MEAN
geom_label(data = press_joined %>% dplyr::filter(LOTNO %in% sample) %>% distinct(mean_press),
aes(x = 60 * 30, y = -0.02 * sec_y_scale + sec_y_shift,
label = paste0( "mean: ", round(mean_press,3) )
),
label.padding = unit(0.2, "lines"),hjust=1,vjust=.5,fill='lightskyblue',label.size=.1
) +
# SD
geom_label(data = press_joined %>% dplyr::filter(LOTNO %in% sample) %>% distinct(sd_press),
aes(x = 60 * 30, y = -0.03 * sec_y_scale + sec_y_shift,
label = paste0( "sd: ", round(sd_press,3) )
),
label.padding = unit(0.2, "lines"),hjust=1,vjust=.5,fill='lightskyblue2',label.size=.1
) +
# POS PRESS
geom_label(data = press_joined %>% dplyr::filter(LOTNO %in% sample) %>% distinct(time_at_pos, pct_time_at_pos, index_1200F),
aes(x = 60 * 72, y = -0.04 * sec_y_scale + sec_y_shift,
label = paste0( time_at_pos, "m, ",
mypercent(pct_time_at_pos))
),
label.padding = unit(0.2, "lines"),hjust=1,vjust=.5,fill='grey80',label.size=.1
) +
# NEG PRESS
geom_label(data = press_joined %>% dplyr::filter(LOTNO %in% sample) %>% distinct(time_at_neg, pct_time_at_neg, index_1200F),
aes(x = 60 * 72, y = -0.06 * sec_y_scale + sec_y_shift,
label = paste0( time_at_neg, "m, ",
mypercent(pct_time_at_neg))
),
label.padding = unit(0.2, "lines"),hjust=1,vjust=.5,fill='grey90'
)+
# LESS 00
geom_label(data = press_joined %>% dplyr::filter(LOTNO %in% sample) %>% distinct(time_less_00, pct_time_less_00, index_1200F),
aes(x = 60 * 72, y = -.02 * sec_y_scale + sec_y_shift,
label = paste0( time_less_00, "m, ",
mypercent(pct_time_less_00))
),
label.padding = unit(0.2, "lines"),hjust=1,vjust=.5,fill='red'
)+
# BETW 00 01
geom_label(data = press_joined %>% dplyr::filter(LOTNO %in% sample) %>% distinct(time_betw_00_01, pct_time_betw_00_01, index_1200F),
aes(x = 60 * 72, y = 0.0 * sec_y_scale + sec_y_shift,
label = paste0( time_betw_00_01, "m, ",
mypercent(pct_time_betw_00_01))
),
label.padding = unit(0.2, "lines"),hjust=1,vjust=.5,fill='yellow3'
) +
# BETW 01 02
geom_label(data = press_joined %>% dplyr::filter(LOTNO %in% sample) %>% distinct(time_betw_01_02, pct_time_betw_01_02, index_1200F),
aes(x = 60 * 72, y = .015 * sec_y_scale + sec_y_shift,
label = paste0( time_betw_01_02, "m, ",
mypercent(pct_time_betw_01_02))
),
label.padding = unit(0.2, "lines"),hjust=1,vjust=.5,fill='green'
) +
# BETW 02 03
geom_label(data = press_joined %>% dplyr::filter(LOTNO %in% sample) %>% distinct(time_betw_02_03, pct_time_betw_02_03, index_1200F),
aes(x = 60 * 72, y = .025 * sec_y_scale + sec_y_shift,
label = paste0( time_betw_02_03, "m, ",
mypercent(pct_time_betw_02_03))
),
label.padding = unit(0.2, "lines"),hjust=1,vjust=.5,fill='green3'
) +
# GREATER 03
geom_label(data = press_joined %>% dplyr::filter(LOTNO %in% sample) %>% distinct(time_greater_03, pct_time_greater_03, index_1200F),
aes(x = 60 * 72, y = 0.04 * sec_y_scale + sec_y_shift,
label = paste0( time_greater_03, "m, ",
mypercent(pct_time_greater_03))
),
label.padding = unit(0.2, "lines"),hjust=1,vjust=.5,fill='red'
) +
facet_wrap(~LOTNO)
# sample random lots
set.seed(8634)
sample <- sample(levels(kilns_All$LOTNO),4)
# plot
press_joined %>%
dplyr::filter(LOTNO %in% sample) %>%
# KILN TEMP, SETPOINT, PRESSURE
ggplot(aes(x=time))+
geom_line(aes(y=avg_kiln_temp))+
geom_line(aes(y=setpoint),color='grey50',linetype='dashed')+
geom_point(aes(y=pressure * sec_y_scale + sec_y_shift),alpha=.1,size=.1)+
# PRESSURE POINTS
geom_point(aes(y=press_betw_02_03 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='green3')+
geom_point(aes(y=press_betw_01_02 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='green')+
geom_point(aes(y=press_greater_03 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='red')+
geom_point(aes(y=press_betw_00_01 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='yellow2')+
geom_point(aes(y=press_less_00 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='red')+
# HLINES
geom_hline(aes(yintercept = .0 * sec_y_scale + sec_y_shift),color='red')+
geom_hline(aes(yintercept = .01 * sec_y_scale + sec_y_shift),color='red',linetype='dashed')+
geom_hline(aes(yintercept = .02 * sec_y_scale + sec_y_shift),color='green',linetype='dashed')+
geom_hline(aes(yintercept = .03 * sec_y_scale + sec_y_shift),color='red',linetype='dashed')+
# mean/sd
# geom_hline(aes(yintercept = mean_press * sec_y_scale + sec_y_shift), color = 'blue',linetype='dotdash',size=1)+
# geom_hline(aes(yintercept = (mean_press+sd_press) * sec_y_scale + sec_y_shift), color = 'blue2',linetype='dotted',size=1)+
# geom_hline(aes(yintercept = (mean_press-sd_press) * sec_y_scale + sec_y_shift), color = 'blue2',linetype='dotted',size=1)+
# PRESSURE RIBBONS
# geom_ribbon(aes(ymin = press_less_00 * sec_y_scale + sec_y_shift, ymax = 0 * sec_y_scale + sec_y_shift),fill='red',alpha=1)+
# geom_ribbon(aes(ymin = 0.00 * sec_y_scale + sec_y_shift, ymax = press_betw_00_01 * sec_y_scale +sec_y_shift),fill='yellow3',alpha=1)+
# geom_ribbon(aes(ymin = 0.0 * sec_y_scale + sec_y_shift, ymax = press_betw_01_03 * sec_y_scale + sec_y_shift),fill='green',alpha=1)+
# geom_ribbon(aes(ymin = 0.0 * sec_y_scale + sec_y_shift, ymax = press_greater_03 * sec_y_scale + sec_y_shift),fill='red',alpha=1)+
# zoom on ribbons
# scale_x_continuous(limits = c(0,18.4*60))+
# scale_y_continuous(limits = c(1250,2350))
# setpoint, temp AUC ribbon
geom_ribbon(aes(ymin = auc_min, ymax = auc_max),fill='pink',alpha=1)+
scale_y_continuous(name = "Kiln temp (F)",
breaks = sort(c(seq(0,3000,1000),400,600,1200)),
sec.axis = sec_axis(~. / sec_y_scale - (sec_y_shift / sec_y_scale),
name = "Pressure",
breaks = seq(-.1,.1,.01)
)
)+
scale_x_continuous(name = "Hours",
labels = number_format(scale=1/60),
breaks = sort(c(seq(0,100*60,12*60)))
)+
# labels
# AUC
geom_label(data = press_joined %>% dplyr::filter(LOTNO %in% sample) %>% distinct(aucDiff),
aes(x = 60 * 30, y = -0.065 * sec_y_scale + sec_y_shift,
label = paste0( "AUC: ", round(aucDiff,0) )
),
label.padding = unit(0.2, "lines"),hjust=1,vjust=.5,fill='pink',label.size=.1
) +
# MEAN
geom_label(data = press_joined %>% dplyr::filter(LOTNO %in% sample) %>% distinct(mean_press),
aes(x = 60 * 30, y = -0.02 * sec_y_scale + sec_y_shift,
label = paste0( "mean: ", round(mean_press,3) )
),
label.padding = unit(0.2, "lines"),hjust=1,vjust=.5,fill='lightskyblue',label.size=.1
) +
# SD
geom_label(data = press_joined %>% dplyr::filter(LOTNO %in% sample) %>% distinct(sd_press),
aes(x = 60 * 30, y = -0.03 * sec_y_scale + sec_y_shift,
label = paste0( "sd: ", round(sd_press,3) )
),
label.padding = unit(0.2, "lines"),hjust=1,vjust=.5,fill='lightskyblue2',label.size=.1
) +
# POS PRESS
geom_label(data = press_joined %>% dplyr::filter(LOTNO %in% sample) %>% distinct(time_at_pos, pct_time_at_pos, index_1200F),
aes(x = 60 * 72, y = -0.04 * sec_y_scale + sec_y_shift,
label = paste0( time_at_pos, "m, ",
mypercent(pct_time_at_pos))
),
label.padding = unit(0.2, "lines"),hjust=1,vjust=.5,fill='grey80',label.size=.1
) +
# NEG PRESS
geom_label(data = press_joined %>% dplyr::filter(LOTNO %in% sample) %>% distinct(time_at_neg, pct_time_at_neg, index_1200F),
aes(x = 60 * 72, y = -0.06 * sec_y_scale + sec_y_shift,
label = paste0( time_at_neg, "m, ",
mypercent(pct_time_at_neg))
),
label.padding = unit(0.2, "lines"),hjust=1,vjust=.5,fill='grey90'
)+
# LESS 00
geom_label(data = press_joined %>% dplyr::filter(LOTNO %in% sample) %>% distinct(time_less_00, pct_time_less_00, index_1200F),
aes(x = 60 * 72, y = -.02 * sec_y_scale + sec_y_shift,
label = paste0( time_less_00, "m, ",
mypercent(pct_time_less_00))
),
label.padding = unit(0.2, "lines"),hjust=1,vjust=.5,fill='red'
)+
# BETW 00 01
geom_label(data = press_joined %>% dplyr::filter(LOTNO %in% sample) %>% distinct(time_betw_00_01, pct_time_betw_00_01, index_1200F),
aes(x = 60 * 72, y = 0.0 * sec_y_scale + sec_y_shift,
label = paste0( time_betw_00_01, "m, ",
mypercent(pct_time_betw_00_01))
),
label.padding = unit(0.2, "lines"),hjust=1,vjust=.5,fill='yellow3'
) +
# BETW 01 02
geom_label(data = press_joined %>% dplyr::filter(LOTNO %in% sample) %>% distinct(time_betw_01_02, pct_time_betw_01_02, index_1200F),
aes(x = 60 * 72, y = .015 * sec_y_scale + sec_y_shift,
label = paste0( time_betw_01_02, "m, ",
mypercent(pct_time_betw_01_02))
),
label.padding = unit(0.2, "lines"),hjust=1,vjust=.5,fill='green'
) +
# BETW 02 03
geom_label(data = press_joined %>% dplyr::filter(LOTNO %in% sample) %>% distinct(time_betw_02_03, pct_time_betw_02_03, index_1200F),
aes(x = 60 * 72, y = .025 * sec_y_scale + sec_y_shift,
label = paste0( time_betw_02_03, "m, ",
mypercent(pct_time_betw_02_03))
),
label.padding = unit(0.2, "lines"),hjust=1,vjust=.5,fill='green3'
) +
# GREATER 03
geom_label(data = press_joined %>% dplyr::filter(LOTNO %in% sample) %>% distinct(time_greater_03, pct_time_greater_03, index_1200F),
aes(x = 60 * 72, y = 0.04 * sec_y_scale + sec_y_shift,
label = paste0( time_greater_03, "m, ",
mypercent(pct_time_greater_03))
),
label.padding = unit(0.2, "lines"),hjust=1,vjust=.5,fill='red'
) +
facet_wrap(~LOTNO)
Do any of these features have an impact on overall lot yields for each kiln? In general it seems the answer is still “maybe?”.
When fitting standard linear models using all available predictors (see table below), \(adjusted R^2\) values are either low or negative. Low values indicate high probability that the effect is random (regardless of significant \(p.values\) in the model), while negative values indicate that the resulting linear model is worse fitting than a horizontal line.
Although low \(p.values\) are often used for variable selection or pruning, the practice is frowned upon. We use LASSO regression (a type of linear regression) in this case which works by shrinking the coefficients of less significant variables.
Standard linear models
# linear model on each kiln
df <- press_summ %>%
dplyr::select(-LOTNO, -time_at_pos, -time_at_neg, -temp_avg)
set.seed(234)
df_split <- initial_split(df)
df_train <- training(df_split)
df_test <- testing(df_split)
lms <- df %>%
nest(data = c(-kiln)) %>%
mutate(fit = map(data, ~lm(lot_yield ~ ., data = .x)),
tidied = map(fit, tidy),
glanced = map(fit, glance),
augmented = map(fit, augment)
)
# glanced
lm_glance <- lms %>%
unnest(glanced) %>%
select(-data,-fit,-tidied,-augmented) %>%
select(kiln, adj.r.squared,AIC,BIC) %>%
arrange(-adj.r.squared) %>%
mutate_if(is.numeric, round, 5)
kable(lm_glance %>%
left_join(count(press_summ$kiln) %>%
mutate(kiln = factor(x)) %>%
select(kiln, freq)) %>%
select(kiln, freq, everything()),
format="html") %>%
kable_styling(full_width=T)
| kiln | freq | adj.r.squared | AIC | BIC |
|---|---|---|---|---|
| D | 87 | 0.08939 | -214.7417 | -172.82131 |
| C | 53 | 0.05258 | -130.9347 | -99.41004 |
| G | 230 | 0.00340 | -764.1599 | -705.71250 |
| H | 180 | -0.02122 | -628.6351 | -574.35483 |
| F | 116 | -0.07592 | -310.2808 | -263.46975 |
| E | 15 | NaN | -Inf | -Inf |
lm_tidy <- lms %>%
unnest(tidied) %>%
select(-data,-fit,-glanced,-augmented) %>%
arrange(kiln, p.value) %>%
mutate_if(is.numeric, round, 5) %>%
mutate(
p.value = ifelse(p.value < .05,
cell_spec(p.value,"html",color="red"),
cell_spec(p.value,"html",color="black"))
)
# filter kiln
Kiln = "C"
df <- press_summ %>%
dplyr::filter(kiln == Kiln) %>%
dplyr::select(precip:aucDiff, lot_yield, -time_greater_03)
# normalize all predictors
df_norm <- recipe(lot_yield ~ ., data = df) %>%
step_normalize(all_predictors()) %>%
prep() %>%
juice()
# split predictors and responses
y = as.vector(df_norm$lot_yield)
x = as.matrix(df_norm[-18])
# find best lambda value
cv.glmmod <- cv.glmnet(x,y,alpha=1)
# plot(cv.glmmod)
best.lambda <- cv.glmmod$lambda.min
# produce model
glmmod <- glmnet(x, y, alpha=1, lambda = best.lambda)
# store coefficient values
coefs = data.frame(coef(glmmod)[,1])
# plot variable importance
tibble(data.frame(cbind(rownames(coefs), coefs))) %>%
set_colnames(c("Variable", "Importance")) %>%
mutate(Sign = if_else(Importance < 0, "Neg", "Pos"),
Importance = abs(Importance),
Variable = fct_reorder(Variable, Importance),
HJ = if_else(Importance > .5, 1,0)) %>%
arrange(-Importance) %>%
ggplot(aes(x=Importance,y=Variable,fill=Sign))+
geom_col()+
geom_text(aes(label=round(Importance,5), hjust=HJ))+
ggtitle(paste0("Kiln ", Kiln, ", Variable Importance by LASSO"))
kable(lm_tidy %>% dplyr::filter(kiln == "C"),
format="html",
caption = paste0("Significant variables by p-value, adj.R^2: ",
round(lm_glance$adj.r.squared[[2]], 3)),
escape = F) %>%
pack_rows( index=c("Kiln C" = 16)) %>%
kable_styling("striped", full_width=T)
| kiln | term | estimate | std.error | statistic | p.value |
|---|---|---|---|---|---|
| Kiln C | |||||
| C | (Intercept) | 0.77728 | 0.10947 | 7.10074 | 0 |
| C | mean_press | -50.65001 | 26.14434 | -1.93732 | 0.06017 |
| C | time_betw_00_01 | 0.00018 | 0.00009 | 1.90540 | 0.06432 |
| C | AUC_neg_press | -0.03511 | 0.02000 | -1.75592 | 0.08716 |
| C | sd_press | 14.27430 | 11.19041 | 1.27558 | 0.20985 |
| C | aucDiff | 0.00000 | 0.00000 | -0.86904 | 0.39028 |
| C | temp_max | 0.00084 | 0.00108 | 0.78094 | 0.43967 |
| C | precip | -0.03094 | 0.04218 | -0.73357 | 0.46771 |
| C | temp_min | 0.00086 | 0.00124 | 0.68991 | 0.49444 |
| C | time_betw_01_02 | 0.00157 | 0.00253 | 0.62276 | 0.53716 |
| C | snow_fall | -0.00717 | 0.01385 | -0.51753 | 0.60779 |
| C | time_less_00 | 0.00003 | 0.00007 | 0.41088 | 0.68347 |
| C | AUC_pos_press | -0.01545 | 0.04457 | -0.34669 | 0.73073 |
| C | time_betw_02_03 | 0.00181 | 0.00697 | 0.25954 | 0.79662 |
| C | snow_depth | 0.00039 | 0.00441 | 0.08933 | 0.92929 |
| C | time_greater_03 | NA | NA | NA | NA |
df %>%
mutate(Yields = case_when( lot_yield >= 0.95 ~ "over95",
lot_yield >= 0.5 ~ "over50",
TRUE ~ "below50")) %>%
pivot_longer(cols = precip:aucDiff, names_to = "measures", values_to="values") %>%
ggplot(aes(x=values,y=lot_yield))+
geom_point(aes(fill=Yields),alpha=.6,shape=21,size=2)+
scale_y_continuous(labels = percent_format())+
facet_wrap(~measures,scales='free_x',nrow=5)+
geom_smooth(method=lm, se=FALSE)
# filter kiln
Kiln = "D"
df <- press_summ %>%
dplyr::filter(kiln == Kiln) %>%
dplyr::select(precip:aucDiff, lot_yield)
# normalize all predictors
df_norm <- recipe(lot_yield ~ ., data = df) %>%
step_normalize(all_predictors()) %>%
prep() %>%
juice()
# split predictors and responses
y = as.vector(df_norm$lot_yield)
x = as.matrix(df_norm[-19])
# find best lambda value
cv.glmmod <- cv.glmnet(x,y,alpha=1)
# plot(cv.glmmod)
best.lambda <- cv.glmmod$lambda.min
# produce model
glmmod <- glmnet(x, y, alpha=1, lambda = best.lambda)
# store coefficient values
coefs = data.frame(coef(glmmod)[,1])
# plot variable importance
tibble(data.frame(cbind(rownames(coefs), coefs))) %>%
set_colnames(c("Variable", "Importance")) %>%
mutate(Sign = if_else(Importance < 0, "Neg", "Pos"),
Importance = abs(Importance),
Variable = fct_reorder(Variable, Importance),
HJ = if_else(Importance > .5, 1,0)) %>%
arrange(-Importance) %>%
ggplot(aes(x=Importance,y=Variable,fill=Sign))+
geom_col()+
geom_text(aes(label=round(Importance,5), hjust=HJ))+
ggtitle(paste0("Kiln ", Kiln, ", Variable Importance by LASSO"))
kable(lm_tidy %>% dplyr::filter(kiln == "D"),
format="html",
caption = paste0("Significant variables by p-value, adj.R^2: ",
round(lm_glance$adj.r.squared[[1]], 3)),
escape = F) %>%
pack_rows( index=c("Kiln D" = 16)) %>%
kable_styling("striped", full_width=T)
| kiln | term | estimate | std.error | statistic | p.value |
|---|---|---|---|---|---|
| Kiln D | |||||
| D | (Intercept) | 0.85510 | 0.10304 | 8.29860 | 0 |
| D | time_betw_02_03 | -0.05802 | 0.02300 | -2.52321 | 0.01387 |
| D | aucDiff | 0.00000 | 0.00000 | -2.06435 | 0.04264 |
| D | time_less_00 | 0.00013 | 0.00011 | 1.21499 | 0.2284 |
| D | snow_fall | 0.01892 | 0.01604 | 1.17939 | 0.24218 |
| D | time_betw_00_01 | 0.00016 | 0.00017 | 0.95080 | 0.34493 |
| D | snow_depth | -0.00317 | 0.00347 | -0.91394 | 0.36384 |
| D | sd_press | -5.95371 | 7.44018 | -0.80021 | 0.42626 |
| D | time_greater_03 | 0.01358 | 0.01917 | 0.70828 | 0.48109 |
| D | AUC_pos_press | -0.00809 | 0.01154 | -0.70091 | 0.48565 |
| D | time_betw_01_02 | -0.00077 | 0.00132 | -0.58766 | 0.55863 |
| D | AUC_neg_press | -0.01128 | 0.02424 | -0.46523 | 0.64319 |
| D | precip | 0.01215 | 0.02844 | 0.42741 | 0.67038 |
| D | mean_press | -7.85613 | 27.96708 | -0.28091 | 0.7796 |
| D | temp_min | -0.00024 | 0.00106 | -0.22580 | 0.822 |
| D | temp_max | 0.00001 | 0.00097 | 0.01156 | 0.99081 |
# D_pruned <- press_summ %>%
# dplyr::filter(kiln == "D") %>%
# dplyr::select(mean_press:aucDiff,lot_yield)
#
# lm(lot_yield ~ ., data = D_pruned) %>% summary()
df %>%
mutate(Yields = case_when( lot_yield >= 0.95 ~ "over95",
lot_yield >= 0.5 ~ "over50",
TRUE ~ "below50")) %>%
pivot_longer(cols = precip:aucDiff, names_to = "measures", values_to="values") %>%
ggplot(aes(x=values,y=lot_yield))+
geom_point(aes(fill=Yields),alpha=.5,shape=21,size=2)+
scale_y_continuous(labels = percent_format())+
facet_wrap(~measures,scales='free_x',nrow=5)+
geom_smooth(method=lm, se=FALSE)
# filter kiln
Kiln = "E"
df <- press_summ %>%
dplyr::filter(kiln == Kiln) %>%
dplyr::select(precip:aucDiff, lot_yield)
# normalize all predictors
df_norm <- recipe(lot_yield ~ ., data = df) %>%
step_normalize(all_predictors()) %>%
prep() %>%
juice()
# split predictors and responses
y = as.vector(df_norm$lot_yield)
x = as.matrix(df_norm[-19])
# find best lambda value
cv.glmmod <- cv.glmnet(x,y,alpha=1)
# plot(cv.glmmod)
best.lambda <- cv.glmmod$lambda.min
# produce model
glmmod <- glmnet(x, y, alpha=1, lambda = best.lambda)
# store coefficient values
coefs = data.frame(coef(glmmod)[,1])
# plot variable importance
tibble(data.frame(cbind(rownames(coefs), coefs))) %>%
set_colnames(c("Variable", "Importance")) %>%
mutate(Sign = if_else(Importance < 0, "Neg", "Pos"),
Importance = abs(Importance),
Variable = fct_reorder(Variable, Importance),
HJ = if_else(Importance > .5, 1,0)) %>%
arrange(-Importance) %>%
ggplot(aes(x=Importance,y=Variable,fill=Sign))+
geom_col()+
geom_text(aes(label=round(Importance,5), hjust=HJ))
ggtitle(paste0("Kiln ", Kiln, ", Variable Importance by LASSO"))
## $title
## [1] "Kiln E, Variable Importance by LASSO"
##
## attr(,"class")
## [1] "labels"
No p-values when using all predictors due to number of observations being so low (and number of predictors being so high). Below model produced omitting weather data.
E_data <- press_summ %>%
dplyr::filter(kiln == "E") %>%
dplyr::select(mean_press:aucDiff, lot_yield)
E_lm <- lm(lot_yield ~ ., data = E_data)
E_summ <- E_lm %>% summary()
kable(lm(lot_yield ~ ., data = E_data) %>%
tidy() %>%
arrange(p.value),
format="html",
caption = paste0("Significant variables by p-value, adj.R^2: ",
round(E_summ$adj.r.squared, 3)),
escape = F) %>%
pack_rows( index=c("Kiln E" = 11)) %>%
kable_styling("striped", full_width=T)
| term | estimate | std.error | statistic | p.value |
|---|---|---|---|---|
| Kiln E | ||||
| mean_press | -103.0961658 | 50.7304220 | -2.0322355 | 0.1119276 |
| (Intercept) | 0.7219258 | 0.4574865 | 1.5780264 | 0.1896984 |
| time_betw_01_02 | 0.0009585 | 0.0006480 | 1.4793267 | 0.2131441 |
| time_betw_02_03 | 0.0251439 | 0.0209406 | 1.2007273 | 0.2960986 |
| time_betw_00_01 | 0.0004330 | 0.0003836 | 1.1289150 | 0.3220531 |
| AUC_neg_press | -0.3272987 | 0.3034684 | -1.0785264 | 0.3414964 |
| time_less_00 | 0.0007057 | 0.0006890 | 1.0242098 | 0.3636325 |
| time_greater_03 | -0.0537037 | 0.0632761 | -0.8487201 | 0.4438437 |
| AUC_pos_press | 0.0275909 | 0.0545036 | 0.5062221 | 0.6393270 |
| aucDiff | -0.0000014 | 0.0000069 | -0.2005447 | 0.8508386 |
| sd_press | -15.8911025 | 88.2164944 | -0.1801375 | 0.8658025 |
df %>%
mutate(Yields = case_when( lot_yield >= 0.95 ~ "over95",
lot_yield >= 0.5 ~ "over50",
TRUE ~ "below50")) %>%
pivot_longer(cols = precip:aucDiff, names_to = "measures", values_to="values") %>%
ggplot(aes(x=values,y=lot_yield))+
geom_point(aes(fill=Yields),alpha=.8,shape=21,size=2)+
scale_y_continuous(labels = percent_format())+
facet_wrap(~measures,scales='free_x',nrow=5)+
geom_smooth(method=lm, se=FALSE)
No significant features when using p-value as a metric.
# filter kiln
Kiln = "F"
df <- press_summ %>%
dplyr::filter(kiln == Kiln) %>%
dplyr::select(precip:aucDiff, lot_yield)
# normalize all predictors
df_norm <- recipe(lot_yield ~ ., data = df) %>%
step_normalize(all_predictors()) %>%
prep() %>%
juice()
# split predictors and responses
y = as.vector(df_norm$lot_yield)
x = as.matrix(df_norm[-19])
# find best lambda value
cv.glmmod <- cv.glmnet(x,y,alpha=1)
# plot(cv.glmmod)
best.lambda <- cv.glmmod$lambda.min
# produce model
glmmod <- glmnet(x, y, alpha=1, lambda = best.lambda)
# store coefficient values
coefs = data.frame(coef(glmmod)[,1])
# plot variable importance
tibble(data.frame(cbind(rownames(coefs), coefs))) %>%
set_colnames(c("Variable", "Importance")) %>%
mutate(Sign = if_else(Importance < 0, "Neg", "Pos"),
Importance = abs(Importance),
Variable = fct_reorder(Variable, Importance),
HJ = if_else(Importance > .5, 1,0)) %>%
arrange(-Importance) %>%
ggplot(aes(x=Importance,y=Variable,fill=Sign))+
geom_col()+
geom_text(aes(label=round(Importance,5), hjust=HJ))+
ggtitle(paste0("Kiln ", Kiln, ", Variable Importance by LASSO"))
kable(lm_tidy %>% dplyr::filter(kiln == "F"),
format="html",
caption = paste0("Significant variables by p-value, adj.R^2: ",
round(lm_glance$adj.r.squared[[5]], 3)),
escape = F) %>%
pack_rows( index=c("Kiln F" = 16)) %>%
kable_styling("striped", full_width=T)
| kiln | term | estimate | std.error | statistic | p.value |
|---|---|---|---|---|---|
| Kiln F | |||||
| F | (Intercept) | 0.93529 | 0.05620 | 16.64232 | 0 |
| F | aucDiff | 0.00000 | 0.00000 | 1.37114 | 0.1734 |
| F | time_betw_00_01 | -0.00017 | 0.00014 | -1.18492 | 0.23886 |
| F | AUC_pos_press | 0.02645 | 0.02582 | 1.02443 | 0.30811 |
| F | precip | -0.01534 | 0.02245 | -0.68315 | 0.49609 |
| F | snow_depth | -0.00180 | 0.00273 | -0.66039 | 0.51052 |
| F | time_betw_01_02 | -0.00022 | 0.00047 | -0.47881 | 0.63312 |
| F | sd_press | 2.03306 | 4.30566 | 0.47218 | 0.63782 |
| F | time_greater_03 | -0.00074 | 0.00165 | -0.44541 | 0.65698 |
| F | temp_max | -0.00032 | 0.00073 | -0.44158 | 0.65975 |
| F | AUC_neg_press | -0.00275 | 0.00695 | -0.39564 | 0.69322 |
| F | time_betw_02_03 | -0.00032 | 0.00117 | -0.27458 | 0.78421 |
| F | temp_min | 0.00018 | 0.00080 | 0.22823 | 0.81993 |
| F | time_less_00 | -0.00001 | 0.00005 | -0.19171 | 0.84836 |
| F | mean_press | -1.17839 | 11.33657 | -0.10395 | 0.91742 |
| F | snow_fall | 0.00025 | 0.01097 | 0.02310 | 0.98162 |
df %>%
mutate(Yields = case_when( lot_yield >= 0.95 ~ "over95",
lot_yield >= 0.5 ~ "over50",
TRUE ~ "below50")) %>%
pivot_longer(cols = precip:aucDiff, names_to = "measures", values_to="values") %>%
ggplot(aes(x=values,y=lot_yield))+
geom_point(aes(fill=Yields),alpha=.5,shape=21,size=2)+
scale_y_continuous(labels = percent_format())+
facet_wrap(~measures,scales='free_x',nrow=5)+
geom_smooth(method=lm, se=FALSE)
# filter kiln
Kiln = "G"
df <- press_summ %>%
dplyr::filter(kiln == Kiln) %>%
dplyr::select(precip:aucDiff, lot_yield)
# normalize all predictors
df_norm <- recipe(lot_yield ~ ., data = df) %>%
step_normalize(all_predictors()) %>%
prep() %>%
juice()
# split predictors and responses
y = as.vector(df_norm$lot_yield)
x = as.matrix(df_norm[-19])
# find best lambda value
cv.glmmod <- cv.glmnet(x,y,alpha=1)
# plot(cv.glmmod)
best.lambda <- cv.glmmod$lambda.min
# produce model
glmmod <- glmnet(x, y, alpha=1, lambda = best.lambda)
# store coefficient values
coefs = data.frame(coef(glmmod)[,1])
# plot variable importance
tibble(data.frame(cbind(rownames(coefs), coefs))) %>%
set_colnames(c("Variable", "Importance")) %>%
mutate(Sign = if_else(Importance < 0, "Neg", "Pos"),
Importance = abs(Importance),
Variable = fct_reorder(Variable, Importance),
HJ = if_else(Importance > .5, 1,0)) %>%
arrange(-Importance) %>%
ggplot(aes(x=Importance,y=Variable,fill=Sign))+
geom_col()+
geom_text(aes(label=round(Importance,5), hjust=HJ))+
ggtitle(paste0("Kiln ", Kiln, ", Variable Importance by LASSO"))
kable(lm_tidy %>% dplyr::filter(kiln == "G"),
format="html",
caption = paste0("Significant variables by p-value, adj.R^2: ",
round(lm_glance$adj.r.squared[[3]], 3)),
escape = F) %>%
pack_rows( index=c("Kiln G" = 16)) %>%
kable_styling("striped", full_width=T)
| kiln | term | estimate | std.error | statistic | p.value |
|---|---|---|---|---|---|
| Kiln G | |||||
| G | (Intercept) | 0.87064 | 0.05984 | 14.54903 | 0 |
| G | time_less_00 | 0.00009 | 0.00005 | 1.59461 | 0.11228 |
| G | AUC_neg_press | -0.00957 | 0.00808 | -1.18511 | 0.23729 |
| G | snow_fall | -0.00364 | 0.00324 | -1.12282 | 0.26277 |
| G | snow_depth | 0.00171 | 0.00166 | 1.02978 | 0.30427 |
| G | precip | -0.00835 | 0.01211 | -0.68932 | 0.49137 |
| G | time_betw_00_01 | 0.00004 | 0.00006 | 0.62366 | 0.53351 |
| G | sd_press | -1.12384 | 1.91632 | -0.58646 | 0.55819 |
| G | temp_max | 0.00021 | 0.00036 | 0.58161 | 0.56144 |
| G | time_betw_02_03 | 0.00006 | 0.00014 | 0.40997 | 0.68224 |
| G | temp_min | -0.00015 | 0.00041 | -0.37007 | 0.71169 |
| G | aucDiff | 0.00000 | 0.00000 | 0.20308 | 0.83926 |
| G | time_betw_01_02 | 0.00001 | 0.00009 | 0.13923 | 0.8894 |
| G | AUC_pos_press | 0.00053 | 0.00826 | 0.06479 | 0.9484 |
| G | mean_press | -0.21427 | 6.24677 | -0.03430 | 0.97267 |
| G | time_greater_03 | 0.00001 | 0.00022 | 0.02473 | 0.9803 |
df %>%
mutate(Yields = case_when( lot_yield >= 0.95 ~ "over95",
lot_yield >= 0.5 ~ "over50",
TRUE ~ "below50")) %>%
dplyr::filter(lot_yield > .4) %>%
pivot_longer(cols = precip:aucDiff, names_to = "measures", values_to="values") %>%
ggplot(aes(x=values,y=lot_yield))+
geom_point(aes(fill=Yields),alpha=.4,shape=21,size=2)+
scale_y_continuous(labels = percent_format())+
facet_wrap(~measures,scales='free_x',nrow=5)+
geom_smooth(method=lm, se=FALSE)
No significant features when using p-value as a metric.
# filter kiln
Kiln = "H"
df <- press_summ %>%
dplyr::filter(kiln == Kiln) %>%
dplyr::select(precip:aucDiff, lot_yield)
# normalize all predictors
df_norm <- recipe(lot_yield ~ ., data = df) %>%
step_normalize(all_predictors()) %>%
prep() %>%
juice()
# split predictors and responses
y = as.vector(df_norm$lot_yield)
x = as.matrix(df_norm[-19])
# find best lambda value
cv.glmmod <- cv.glmnet(x,y,alpha=1)
# plot(cv.glmmod)
best.lambda <- cv.glmmod$lambda.min
# produce model
glmmod <- glmnet(x, y, alpha=1, lambda = best.lambda)
# store coefficient values
coefs = data.frame(coef(glmmod)[,1])
# plot variable importance
tibble(data.frame(cbind(rownames(coefs), coefs))) %>%
set_colnames(c("Variable", "Importance")) %>%
mutate(Sign = if_else(Importance < 0, "Neg", "Pos"),
Importance = abs(Importance),
Variable = fct_reorder(Variable, Importance),
HJ = if_else(Importance > .5, 1,0)) %>%
arrange(-Importance) %>%
ggplot(aes(x=Importance,y=Variable,fill=Sign))+
geom_col()+
geom_text(aes(label=round(Importance,5), hjust=HJ))+
ggtitle(paste0("Kiln ", Kiln, ", Variable Importance by LASSO"))
kable(lm_tidy %>% dplyr::filter(kiln == "H"),
format="html",
caption = paste0("Significant variables by p-value, adj.R^2: ",
round(lm_glance$adj.r.squared[[4]], 3)),
escape = F) %>%
pack_rows( index=c("Kiln H" = 16)) %>%
kable_styling("striped", full_width=T)
| kiln | term | estimate | std.error | statistic | p.value |
|---|---|---|---|---|---|
| Kiln H | |||||
| H | (Intercept) | 0.95661 | 0.11408 | 8.38530 | 0 |
| H | snow_fall | 0.00473 | 0.00381 | 1.24081 | 0.21645 |
| H | time_greater_03 | -0.00038 | 0.00035 | -1.08943 | 0.27756 |
| H | aucDiff | 0.00000 | 0.00000 | -1.05868 | 0.2913 |
| H | time_betw_02_03 | -0.00019 | 0.00019 | -0.98821 | 0.32451 |
| H | sd_press | 2.65031 | 3.01026 | 0.88042 | 0.37992 |
| H | time_betw_00_01 | -0.00008 | 0.00011 | -0.80354 | 0.42282 |
| H | AUC_pos_press | 0.01408 | 0.01754 | 0.80278 | 0.42326 |
| H | time_betw_01_02 | -0.00008 | 0.00012 | -0.71793 | 0.47382 |
| H | AUC_neg_press | -0.01066 | 0.01748 | -0.60977 | 0.54286 |
| H | mean_press | -10.83243 | 18.28962 | -0.59227 | 0.55448 |
| H | time_less_00 | -0.00006 | 0.00011 | -0.55437 | 0.58008 |
| H | temp_min | -0.00023 | 0.00047 | -0.49270 | 0.62288 |
| H | temp_max | 0.00015 | 0.00042 | 0.34538 | 0.73025 |
| H | precip | -0.00276 | 0.01207 | -0.22849 | 0.81955 |
| H | snow_depth | 0.00002 | 0.00161 | 0.01433 | 0.98859 |
df %>%
mutate(Yields = case_when( lot_yield >= 0.95 ~ "over95",
lot_yield >= 0.5 ~ "over50",
TRUE ~ "below50")) %>%
pivot_longer(cols = precip:aucDiff, names_to = "measures", values_to="values") %>%
ggplot(aes(x=values,y=lot_yield))+
geom_point(aes(fill=Yields),alpha=.5,shape=21,size=2)+
scale_y_continuous(labels = percent_format())+
facet_wrap(~measures,scales='free_x',nrow=5)+
geom_smooth(method=lm, se=FALSE)
Perform analysis on items rather than kilns. This may shed light on whether certain items produce better yields under different circumstances than others.
# df yields join with press summ ... do pressures impact yields of items?
# get item yields, join with pressure stats
item_yields <- df_yields %>%
dplyr::select(LOTNO, ITEM, DESCRIPTION, FIRE_DATE, total_item_pct_yield) %>%
na.omit() %>%
right_join(press_summ)
item = "215.5MMBODX38MM,45PPI,PSZM,FEC,1/8\"FG"
item_df <- item_yields %>%
dplyr::filter(DESCRIPTION == "215.5MMBODX38MM,45PPI,PSZM,FEC,1/8\"FG") %>%
mutate(Yields = case_when( total_item_pct_yield >= 0.95 ~ "over95",
total_item_pct_yield >= 0.5 ~ "over50",
TRUE ~ "below50"))
item_df %>%
ggplot(aes(x=FIRE_DATE,y=total_item_pct_yield))+
geom_point(aes(fill=Yields),size=3, shape=21, alpha=.8)+
geom_smooth(method=lm, se=FALSE)+
geom_line(alpha=.2)+
# theme(legend.position = "none")+
scale_y_continuous(limits = c(0,1), labels = percent_format())+
scale_fill_brewer(type="qual",palette=2)+
ggtitle(paste0(item))+
labs(x="Fire date", y="Item yield")
df <- item_df %>%
dplyr::select(total_item_pct_yield:aucDiff)
# normalize all predictors
df_norm <- recipe(total_item_pct_yield ~ ., data = df) %>%
step_normalize(all_predictors()) %>%
prep() %>%
juice()
# split predictors and responses
y = as.vector(df_norm$total_item_pct_yield)
x = as.matrix(df_norm[-19])
set.seed(234)
# find best lambda value
cv.glmmod <- cv.glmnet(x,y,alpha=1)
# plot(cv.glmmod)
best.lambda <- cv.glmmod$lambda.min
set.seed(234)
# produce model
glmmod <- glmnet(x, y, alpha=1, lambda = best.lambda)
# store coefficient values
coefs = data.frame(coef(glmmod)[,1])
# plot variable importance
tibble(data.frame(cbind(rownames(coefs), coefs))) %>%
set_colnames(c("Variable", "Importance")) %>%
mutate(Sign = if_else(Importance < 0, "Neg", "Pos"),
Importance = abs(Importance),
Variable = fct_reorder(Variable, Importance),
HJ = if_else(Importance > .5, 1,0)) %>%
arrange(-Importance) %>%
ggplot(aes(x=Importance,y=Variable,fill=Sign))+
geom_col()+
geom_text(aes(label=round(Importance,5), hjust=HJ))+
ggtitle(paste0(item, ", VI by LASSO"))
item_df %>%
dplyr::select(total_item_pct_yield:Yields) %>%
pivot_longer(cols = precip:aucDiff, names_to = "measures", values_to="values") %>%
ggplot(aes(x=values,y=total_item_pct_yield))+
geom_point(aes(fill=Yields),size=2,shape=21,alpha=.8)+
scale_y_continuous(limits = c(0,1), labels = percent_format())+
facet_wrap(~measures,scales='free_x')+
geom_smooth(method=lm, se=FALSE)
sample <- item_yields %>%
dplyr::filter(DESCRIPTION == item) %>%
dplyr::select(LOTNO) %>% unlist()
sec_y_scale=20000
sec_y_shift=1500
press_joined_item <- press_joined %>%
dplyr::filter(LOTNO %in% sample) %>%
left_join(
item_yields %>%
dplyr::filter(DESCRIPTION == item) %>%
dplyr::select(LOTNO,total_item_pct_yield)
) %>%
mutate(classify = factor(case_when( total_item_pct_yield >= 0.95 ~ "over95",
total_item_pct_yield >= 0.5 ~ "over50",
TRUE ~ "below50")))
press_joined_item %>%
# kiln temp, setpoint, pressure
ggplot(aes(x=time))+
geom_line(aes(y=avg_kiln_temp))+
geom_line(aes(y=setpoint),color='grey50',linetype='dashed')+
geom_point(aes(y=pressure * sec_y_scale + sec_y_shift),alpha=.1,size=.1)+
# color points
geom_point(aes(y=press_greater_03 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='red')+
geom_point(aes(y=press_betw_02_03 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='green3')+
geom_point(aes(y=press_betw_01_02 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='green')+
geom_point(aes(y=press_betw_00_01 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='yellow2')+
geom_point(aes(y=press_less_00 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='red')+
geom_ribbon(aes(ymin = auc_min, ymax = auc_max),fill='pink',alpha=1)+
scale_y_continuous(name = "Kiln temp (F)",
breaks = sort(c(seq(0,3000,1000),400,600,1200)),
sec.axis = sec_axis(~. / sec_y_scale - (sec_y_shift / sec_y_scale),
name = "Pressure",
breaks = seq(-.1,.1,.01)
)
)+
scale_x_continuous(name = "Hours",
labels = number_format(scale=1/60),
breaks = sort(c(seq(0,100*60,12*60)))
)+
# yields
geom_label(data = press_joined_item %>% dplyr::filter(LOTNO %in% sample) %>%
distinct(total_item_pct_yield),
aes(x = 60 * 65, y = -0.04 * sec_y_scale + sec_y_shift,
label = paste0( "Yield: ", mypercent(total_item_pct_yield) )
),
label.padding = unit(0.2, "lines"),
hjust=1,vjust=.5,
fill='lightskyblue',
label.size=.1
) +
facet_wrap(~LOTNO)
press_joined_item %>%
dplyr::filter(classify == "over95") %>%
# kiln temp, setpoint, pressure
ggplot(aes(x=time))+
geom_line(aes(y=avg_kiln_temp))+
geom_line(aes(y=setpoint),color='grey50',linetype='dashed')+
geom_point(aes(y=pressure * sec_y_scale + sec_y_shift),alpha=.1,size=.1)+
# color points
geom_point(aes(y=press_greater_03 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='red')+
geom_point(aes(y=press_betw_02_03 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='green3')+
geom_point(aes(y=press_betw_01_02 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='green')+
geom_point(aes(y=press_betw_00_01 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='yellow2')+
geom_point(aes(y=press_less_00 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='red')+
geom_ribbon(aes(ymin = auc_min, ymax = auc_max),fill='pink',alpha=1)+
scale_y_continuous(name = "Kiln temp (F)",
breaks = sort(c(seq(0,3000,1000),400,600,1200)),
sec.axis = sec_axis(~. / sec_y_scale - (sec_y_shift / sec_y_scale),
name = "Pressure",
breaks = seq(-.1,.1,.01)
)
)+
scale_x_continuous(name = "Hours",
labels = number_format(scale=1/60),
breaks = sort(c(seq(0,100*60,12*60)))
)+
# yields
geom_label(data = press_joined_item %>% dplyr::filter(LOTNO %in% sample) %>%
distinct(total_item_pct_yield),
aes(x = 60 * 65, y = -0.04 * sec_y_scale + sec_y_shift,
label = paste0( "Yield: ", mypercent(total_item_pct_yield) )
),
label.padding = unit(0.2, "lines"),
hjust=1,vjust=.5,
fill='lightskyblue',
label.size=.1
) +
facet_wrap(~LOTNO)
press_joined_item %>%
dplyr::filter(classify == "over50") %>%
# kiln temp, setpoint, pressure
ggplot(aes(x=time))+
geom_line(aes(y=avg_kiln_temp))+
geom_line(aes(y=setpoint),color='grey50',linetype='dashed')+
geom_point(aes(y=pressure * sec_y_scale + sec_y_shift),alpha=.1,size=.1)+
# color points
geom_point(aes(y=press_greater_03 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='red')+
geom_point(aes(y=press_betw_02_03 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='green3')+
geom_point(aes(y=press_betw_01_02 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='green')+
geom_point(aes(y=press_betw_00_01 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='yellow2')+
geom_point(aes(y=press_less_00 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='red')+
geom_ribbon(aes(ymin = auc_min, ymax = auc_max),fill='pink',alpha=1)+
scale_y_continuous(name = "Kiln temp (F)",
breaks = sort(c(seq(0,3000,1000),400,600,1200)),
sec.axis = sec_axis(~. / sec_y_scale - (sec_y_shift / sec_y_scale),
name = "Pressure",
breaks = seq(-.1,.1,.01)
)
)+
scale_x_continuous(name = "Hours",
labels = number_format(scale=1/60),
breaks = sort(c(seq(0,100*60,12*60)))
)+
# yields
geom_label(data = press_joined_item %>% dplyr::filter(LOTNO %in% sample) %>%
distinct(total_item_pct_yield),
aes(x = 60 * 65, y = -0.04 * sec_y_scale + sec_y_shift,
label = paste0( "Yield: ", mypercent(total_item_pct_yield) )
),
label.padding = unit(0.2, "lines"),
hjust=1,vjust=.5,
fill='lightskyblue',
label.size=.1
) +
facet_wrap(~LOTNO)
press_joined_item %>%
dplyr::filter(classify == "below50") %>%
# kiln temp, setpoint, pressure
ggplot(aes(x=time))+
geom_line(aes(y=avg_kiln_temp))+
geom_line(aes(y=setpoint),color='grey50',linetype='dashed')+
geom_point(aes(y=pressure * sec_y_scale + sec_y_shift),alpha=.1,size=.1)+
# color points
geom_point(aes(y=press_greater_03 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='red')+
geom_point(aes(y=press_betw_02_03 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='green3')+
geom_point(aes(y=press_betw_01_02 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='green')+
geom_point(aes(y=press_betw_00_01 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='yellow2')+
geom_point(aes(y=press_less_00 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='red')+
geom_ribbon(aes(ymin = auc_min, ymax = auc_max),fill='pink',alpha=1)+
scale_y_continuous(name = "Kiln temp (F)",
breaks = sort(c(seq(0,3000,1000),400,600,1200)),
sec.axis = sec_axis(~. / sec_y_scale - (sec_y_shift / sec_y_scale),
name = "Pressure",
breaks = seq(-.1,.1,.01)
)
)+
scale_x_continuous(name = "Hours",
labels = number_format(scale=1/60),
breaks = sort(c(seq(0,100*60,12*60)))
)+
# yields
geom_label(data = press_joined_item %>% dplyr::filter(LOTNO %in% sample) %>%
distinct(total_item_pct_yield),
aes(x = 60 * 65, y = -0.04 * sec_y_scale + sec_y_shift,
label = paste0( "Yield: ", mypercent(total_item_pct_yield) )
),
label.padding = unit(0.2, "lines"),
hjust=1,vjust=.5,
fill='lightskyblue',
label.size=.1
) +
facet_wrap(~LOTNO)
item <- count(item_yields$DESCRIPTION) %>%
arrange(-freq) %>%
slice(1) %>%
dplyr::select(x) %>%
unlist()
item_df <- item_yields %>%
dplyr::filter(DESCRIPTION == item) %>%
mutate(Yields = case_when( total_item_pct_yield >= 0.95 ~ "over95",
total_item_pct_yield >= 0.5 ~ "over50",
TRUE ~ "below50"))
item_df %>%
ggplot(aes(x=FIRE_DATE,y=total_item_pct_yield))+
geom_point(aes(fill=Yields),size=3, shape=21, alpha=.8)+
geom_smooth(method=lm, se=FALSE)+
geom_line(alpha=.2)+
# theme(legend.position = "none")+
scale_y_continuous(limits = c(0,1), labels = percent_format())+
scale_fill_brewer(type="qual",palette=2)+
ggtitle(paste0(item))+
labs(x="Fire date", y="Item yield")
df <- item_df %>%
dplyr::select(total_item_pct_yield:aucDiff)
# normalize all predictors
df_norm <- recipe(total_item_pct_yield ~ ., data = df) %>%
step_normalize(all_predictors()) %>%
prep() %>%
juice()
# split predictors and responses
y = as.vector(df_norm$total_item_pct_yield)
x = as.matrix(df_norm[-19])
set.seed(344)
# find best lambda value
cv.glmmod <- cv.glmnet(x,y,alpha=1)
# plot(cv.glmmod)
best.lambda <- cv.glmmod$lambda.min
set.seed(344)
# produce model
glmmod <- glmnet(x, y, alpha=1, lambda = best.lambda)
# store coefficient values
coefs = data.frame(coef(glmmod)[,1])
# plot variable importance
tibble(data.frame(cbind(rownames(coefs), coefs))) %>%
set_colnames(c("Variable", "Importance")) %>%
mutate(Sign = if_else(Importance < 0, "Neg", "Pos"),
Importance = abs(Importance),
Variable = fct_reorder(Variable, Importance),
HJ = if_else(Importance > .5, 1,0)) %>%
arrange(-Importance) %>%
ggplot(aes(x=Importance,y=Variable,fill=Sign))+
geom_col()+
geom_text(aes(label=round(Importance,5), hjust=HJ))+
ggtitle(paste0(item, ", VI by LASSO"))
item_df %>%
dplyr::select(total_item_pct_yield:Yields) %>%
pivot_longer(cols = precip:aucDiff, names_to = "measures", values_to="values") %>%
ggplot(aes(x=values,y=total_item_pct_yield))+
geom_point(aes(fill=Yields),size=2,shape=21,alpha=.8)+
scale_y_continuous(limits = c(0,1), labels = percent_format())+
facet_wrap(~measures,scales='free_x')+
geom_smooth(method=lm, se=FALSE)
set.seed(5532)
a <- item_yields %>%
dplyr::filter(DESCRIPTION == item) %>%
dplyr::filter(total_item_pct_yield < .5) %>%
select(LOTNO) %>% unlist() %>% as.vector()
set.seed(5532)
b <- item_yields %>%
dplyr::filter(DESCRIPTION == item) %>%
dplyr::filter(total_item_pct_yield >= .95) %>%
select(LOTNO) %>% unlist() %>% sample(8) %>% as.vector()
set.seed(5532)
c <- item_yields %>%
dplyr::filter(DESCRIPTION == item) %>%
dplyr::filter((total_item_pct_yield >.7) & (total_item_pct_yield < .95)) %>%
select(LOTNO) %>% unlist() %>% sample(7) %>% as.vector()
sample <- c(a,b,c)
# sample <- item_yields %>%
# dplyr::filter(DESCRIPTION == item) %>%
# dplyr::select(LOTNO) %>% unlist() %>%
# sample(16)
sec_y_scale=20000
sec_y_shift=1500
press_joined_item <- press_joined %>%
dplyr::filter(LOTNO %in% sample) %>%
left_join(
item_yields %>%
dplyr::filter(DESCRIPTION == item) %>%
dplyr::select(LOTNO,total_item_pct_yield)
) %>%
mutate(classify = factor(case_when( total_item_pct_yield >= 0.95 ~ "over95",
total_item_pct_yield >= 0.5 ~ "over50",
TRUE ~ "below50")))
press_joined_item %>%
# kiln temp, setpoint, pressure
ggplot(aes(x=time))+
geom_line(aes(y=avg_kiln_temp))+
geom_line(aes(y=setpoint),color='grey50',linetype='dashed')+
geom_point(aes(y=pressure * sec_y_scale + sec_y_shift),alpha=.1,size=.1)+
# color points
geom_point(aes(y=press_greater_03 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='red')+
geom_point(aes(y=press_betw_02_03 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='green3')+
geom_point(aes(y=press_betw_01_02 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='green')+
geom_point(aes(y=press_betw_00_01 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='yellow2')+
geom_point(aes(y=press_less_00 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='red')+
geom_ribbon(aes(ymin = auc_min, ymax = auc_max),fill='pink',alpha=1)+
scale_y_continuous(name = "Kiln temp (F)",
breaks = sort(c(seq(0,3000,1000),400,600,1200)),
sec.axis = sec_axis(~. / sec_y_scale - (sec_y_shift / sec_y_scale),
name = "Pressure",
breaks = seq(-.1,.1,.01)
)
)+
scale_x_continuous(name = "Hours",
labels = number_format(scale=1/60),
breaks = sort(c(seq(0,100*60,12*60)))
)+
# yields
geom_label(data = press_joined_item %>% dplyr::filter(LOTNO %in% sample) %>%
distinct(total_item_pct_yield),
aes(x = 60 * 65, y = -0.04 * sec_y_scale + sec_y_shift,
label = paste0( "Yield: ", mypercent(total_item_pct_yield) )
),
label.padding = unit(0.2, "lines"),
hjust=1,vjust=.5,
fill='lightskyblue',
label.size=.1
) +
facet_wrap(~LOTNO)
press_joined_item %>%
dplyr::filter(classify == "over95") %>%
# kiln temp, setpoint, pressure
ggplot(aes(x=time))+
geom_line(aes(y=avg_kiln_temp))+
geom_line(aes(y=setpoint),color='grey50',linetype='dashed')+
geom_point(aes(y=pressure * sec_y_scale + sec_y_shift),alpha=.1,size=.1)+
# color points
geom_point(aes(y=press_greater_03 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='red')+
geom_point(aes(y=press_betw_02_03 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='green3')+
geom_point(aes(y=press_betw_01_02 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='green')+
geom_point(aes(y=press_betw_00_01 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='yellow2')+
geom_point(aes(y=press_less_00 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='red')+
geom_ribbon(aes(ymin = auc_min, ymax = auc_max),fill='pink',alpha=1)+
scale_y_continuous(name = "Kiln temp (F)",
breaks = sort(c(seq(0,3000,1000),400,600,1200)),
sec.axis = sec_axis(~. / sec_y_scale - (sec_y_shift / sec_y_scale),
name = "Pressure",
breaks = seq(-.1,.1,.01)
)
)+
scale_x_continuous(name = "Hours",
labels = number_format(scale=1/60),
breaks = sort(c(seq(0,100*60,12*60)))
)+
# yields
geom_label(data = press_joined_item %>% dplyr::filter(LOTNO %in% sample) %>%
distinct(total_item_pct_yield),
aes(x = 60 * 65, y = -0.04 * sec_y_scale + sec_y_shift,
label = paste0( "Yield: ", mypercent(total_item_pct_yield) )
),
label.padding = unit(0.2, "lines"),
hjust=1,vjust=.5,
fill='lightskyblue',
label.size=.1
) +
facet_wrap(~LOTNO)
press_joined_item %>%
dplyr::filter(classify == "over50") %>%
# kiln temp, setpoint, pressure
ggplot(aes(x=time))+
geom_line(aes(y=avg_kiln_temp))+
geom_line(aes(y=setpoint),color='grey50',linetype='dashed')+
geom_point(aes(y=pressure * sec_y_scale + sec_y_shift),alpha=.1,size=.1)+
# color points
geom_point(aes(y=press_greater_03 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='red')+
geom_point(aes(y=press_betw_02_03 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='green3')+
geom_point(aes(y=press_betw_01_02 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='green')+
geom_point(aes(y=press_betw_00_01 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='yellow2')+
geom_point(aes(y=press_less_00 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='red')+
geom_ribbon(aes(ymin = auc_min, ymax = auc_max),fill='pink',alpha=1)+
scale_y_continuous(name = "Kiln temp (F)",
breaks = sort(c(seq(0,3000,1000),400,600,1200)),
sec.axis = sec_axis(~. / sec_y_scale - (sec_y_shift / sec_y_scale),
name = "Pressure",
breaks = seq(-.1,.1,.01)
)
)+
scale_x_continuous(name = "Hours",
labels = number_format(scale=1/60),
breaks = sort(c(seq(0,100*60,12*60)))
)+
# yields
geom_label(data = press_joined_item %>% dplyr::filter(LOTNO %in% sample) %>%
distinct(total_item_pct_yield),
aes(x = 60 * 65, y = -0.04 * sec_y_scale + sec_y_shift,
label = paste0( "Yield: ", mypercent(total_item_pct_yield) )
),
label.padding = unit(0.2, "lines"),
hjust=1,vjust=.5,
fill='lightskyblue',
label.size=.1
) +
facet_wrap(~LOTNO)
press_joined_item %>%
dplyr::filter(classify == "below50") %>%
# kiln temp, setpoint, pressure
ggplot(aes(x=time))+
geom_line(aes(y=avg_kiln_temp))+
geom_line(aes(y=setpoint),color='grey50',linetype='dashed')+
geom_point(aes(y=pressure * sec_y_scale + sec_y_shift),alpha=.1,size=.1)+
# color points
geom_point(aes(y=press_greater_03 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='red')+
geom_point(aes(y=press_betw_02_03 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='green3')+
geom_point(aes(y=press_betw_01_02 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='green')+
geom_point(aes(y=press_betw_00_01 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='yellow2')+
geom_point(aes(y=press_less_00 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='red')+
geom_ribbon(aes(ymin = auc_min, ymax = auc_max),fill='pink',alpha=1)+
scale_y_continuous(name = "Kiln temp (F)",
breaks = sort(c(seq(0,3000,1000),400,600,1200)),
sec.axis = sec_axis(~. / sec_y_scale - (sec_y_shift / sec_y_scale),
name = "Pressure",
breaks = seq(-.1,.1,.01)
)
)+
scale_x_continuous(name = "Hours",
labels = number_format(scale=1/60),
breaks = sort(c(seq(0,100*60,12*60)))
)+
# yields
geom_label(data = press_joined_item %>% dplyr::filter(LOTNO %in% sample) %>%
distinct(total_item_pct_yield),
aes(x = 60 * 65, y = -0.04 * sec_y_scale + sec_y_shift,
label = paste0( "Yield: ", mypercent(total_item_pct_yield) )
),
label.padding = unit(0.2, "lines"),
hjust=1,vjust=.5,
fill='lightskyblue',
label.size=.1
) +
facet_wrap(~LOTNO)
item <- count(item_yields$DESCRIPTION) %>%
arrange(-freq) %>%
slice(7) %>%
dplyr::select(x) %>%
unlist()
item_df <- item_yields %>%
dplyr::filter(DESCRIPTION == item) %>%
mutate(Yields = case_when( total_item_pct_yield >= 0.95 ~ "over95",
total_item_pct_yield >= 0.5 ~ "over50",
TRUE ~ "below50"))
item_df %>%
ggplot(aes(x=FIRE_DATE,y=total_item_pct_yield))+
geom_point(aes(fill=Yields),size=3, shape=21, alpha=.8)+
geom_smooth(method=lm, se=FALSE)+
geom_line(alpha=.2)+
# theme(legend.position = "none")+
scale_y_continuous(limits = c(0,1), labels = percent_format())+
scale_fill_brewer(type="qual",palette=2)+
ggtitle(paste0(item))+
labs(x="Fire date", y="Item yield")
df <- item_df %>%
dplyr::select(total_item_pct_yield:aucDiff)
# normalize all predictors
df_norm <- recipe(total_item_pct_yield ~ ., data = df) %>%
step_normalize(all_predictors()) %>%
prep() %>%
juice()
# split predictors and responses
y = as.vector(df_norm$total_item_pct_yield)
x = as.matrix(df_norm[-19])
set.seed(344)
# find best lambda value
cv.glmmod <- cv.glmnet(x,y,alpha=1)
# plot(cv.glmmod)
best.lambda <- cv.glmmod$lambda.min
set.seed(344)
# produce model
glmmod <- glmnet(x, y, alpha=1, lambda = best.lambda)
# store coefficient values
coefs = data.frame(coef(glmmod)[,1])
# plot variable importance
tibble(data.frame(cbind(rownames(coefs), coefs))) %>%
set_colnames(c("Variable", "Importance")) %>%
mutate(Sign = if_else(Importance < 0, "Neg", "Pos"),
Importance = abs(Importance),
Variable = fct_reorder(Variable, Importance),
HJ = if_else(Importance > .5, 1,0)) %>%
arrange(-Importance) %>%
ggplot(aes(x=Importance,y=Variable,fill=Sign))+
geom_col()+
geom_text(aes(label=round(Importance,5), hjust=HJ))+
ggtitle(paste0(item, ", VI by LASSO"))
item_df %>%
dplyr::select(total_item_pct_yield:Yields) %>%
pivot_longer(cols = precip:aucDiff, names_to = "measures", values_to="values") %>%
ggplot(aes(x=values,y=total_item_pct_yield))+
geom_point(aes(fill=Yields),size=2,shape=21,alpha=.8)+
scale_y_continuous(limits = c(0,1), labels = percent_format())+
facet_wrap(~measures,scales='free_x')+
geom_smooth(method=lm, se=FALSE)
set.seed(5532)
a <- item_yields %>%
dplyr::filter(DESCRIPTION == item) %>%
dplyr::filter(total_item_pct_yield <= .5) %>%
select(LOTNO) %>% unlist() %>% as.vector()
set.seed(5532)
b <- item_yields %>%
dplyr::filter(DESCRIPTION == item) %>%
dplyr::filter(total_item_pct_yield >= .95) %>%
select(LOTNO) %>% unlist() %>% sample(8) %>% as.vector()
set.seed(5532)
c <- item_yields %>%
dplyr::filter(DESCRIPTION == item) %>%
dplyr::filter((total_item_pct_yield >.5) & (total_item_pct_yield < .95)) %>%
select(LOTNO) %>% unlist() %>% sample(7) %>% as.vector()
sample <- c(a,b,c)
# sample <- item_yields %>%
# dplyr::filter(DESCRIPTION == item) %>%
# dplyr::select(LOTNO) %>% unlist() %>%
# sample(16)
sec_y_scale=20000
sec_y_shift=1500
press_joined_item <- press_joined %>%
dplyr::filter(LOTNO %in% sample) %>%
left_join(
item_yields %>%
dplyr::filter(DESCRIPTION == item) %>%
dplyr::select(LOTNO,total_item_pct_yield)
) %>%
mutate(classify = factor(case_when( total_item_pct_yield >= 0.95 ~ "over95",
total_item_pct_yield >= 0.5 ~ "over50",
TRUE ~ "below50")))
press_joined_item %>%
# kiln temp, setpoint, pressure
ggplot(aes(x=time))+
geom_line(aes(y=avg_kiln_temp))+
geom_line(aes(y=setpoint),color='grey50',linetype='dashed')+
geom_point(aes(y=pressure * sec_y_scale + sec_y_shift),alpha=.1,size=.1)+
# color points
geom_point(aes(y=press_greater_03 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='red')+
geom_point(aes(y=press_betw_02_03 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='green3')+
geom_point(aes(y=press_betw_01_02 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='green')+
geom_point(aes(y=press_betw_00_01 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='yellow2')+
geom_point(aes(y=press_less_00 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='red')+
geom_ribbon(aes(ymin = auc_min, ymax = auc_max),fill='pink',alpha=1)+
scale_y_continuous(name = "Kiln temp (F)",
breaks = sort(c(seq(0,3000,1000),400,600,1200)),
sec.axis = sec_axis(~. / sec_y_scale - (sec_y_shift / sec_y_scale),
name = "Pressure",
breaks = seq(-.1,.1,.01)
)
)+
scale_x_continuous(name = "Hours",
labels = number_format(scale=1/60),
breaks = sort(c(seq(0,100*60,12*60)))
)+
# yields
geom_label(data = press_joined_item %>% dplyr::filter(LOTNO %in% sample) %>%
distinct(total_item_pct_yield),
aes(x = 60 * 65, y = -0.04 * sec_y_scale + sec_y_shift,
label = paste0( "Yield: ", mypercent(total_item_pct_yield) )
),
label.padding = unit(0.2, "lines"),
hjust=1,vjust=.5,
fill='lightskyblue',
label.size=.1
) +
facet_wrap(~LOTNO)
press_joined_item %>%
dplyr::filter(classify == "over95") %>%
# kiln temp, setpoint, pressure
ggplot(aes(x=time))+
geom_line(aes(y=avg_kiln_temp))+
geom_line(aes(y=setpoint),color='grey50',linetype='dashed')+
geom_point(aes(y=pressure * sec_y_scale + sec_y_shift),alpha=.1,size=.1)+
# color points
geom_point(aes(y=press_greater_03 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='red')+
geom_point(aes(y=press_betw_02_03 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='green3')+
geom_point(aes(y=press_betw_01_02 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='green')+
geom_point(aes(y=press_betw_00_01 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='yellow2')+
geom_point(aes(y=press_less_00 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='red')+
geom_ribbon(aes(ymin = auc_min, ymax = auc_max),fill='pink',alpha=1)+
scale_y_continuous(name = "Kiln temp (F)",
breaks = sort(c(seq(0,3000,1000),400,600,1200)),
sec.axis = sec_axis(~. / sec_y_scale - (sec_y_shift / sec_y_scale),
name = "Pressure",
breaks = seq(-.1,.1,.01)
)
)+
scale_x_continuous(name = "Hours",
labels = number_format(scale=1/60),
breaks = sort(c(seq(0,100*60,12*60)))
)+
# yields
geom_label(data = press_joined_item %>% dplyr::filter(LOTNO %in% sample) %>%
distinct(total_item_pct_yield),
aes(x = 60 * 65, y = -0.04 * sec_y_scale + sec_y_shift,
label = paste0( "Yield: ", mypercent(total_item_pct_yield) )
),
label.padding = unit(0.2, "lines"),
hjust=1,vjust=.5,
fill='lightskyblue',
label.size=.1
) +
facet_wrap(~LOTNO)
press_joined_item %>%
dplyr::filter(classify == "over50") %>%
# kiln temp, setpoint, pressure
ggplot(aes(x=time))+
geom_line(aes(y=avg_kiln_temp))+
geom_line(aes(y=setpoint),color='grey50',linetype='dashed')+
geom_point(aes(y=pressure * sec_y_scale + sec_y_shift),alpha=.1,size=.1)+
# color points
geom_point(aes(y=press_greater_03 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='red')+
geom_point(aes(y=press_betw_02_03 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='green3')+
geom_point(aes(y=press_betw_01_02 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='green')+
geom_point(aes(y=press_betw_00_01 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='yellow2')+
geom_point(aes(y=press_less_00 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='red')+
geom_ribbon(aes(ymin = auc_min, ymax = auc_max),fill='pink',alpha=1)+
scale_y_continuous(name = "Kiln temp (F)",
breaks = sort(c(seq(0,3000,1000),400,600,1200)),
sec.axis = sec_axis(~. / sec_y_scale - (sec_y_shift / sec_y_scale),
name = "Pressure",
breaks = seq(-.1,.1,.01)
)
)+
scale_x_continuous(name = "Hours",
labels = number_format(scale=1/60),
breaks = sort(c(seq(0,100*60,12*60)))
)+
# yields
geom_label(data = press_joined_item %>% dplyr::filter(LOTNO %in% sample) %>%
distinct(total_item_pct_yield),
aes(x = 60 * 65, y = -0.04 * sec_y_scale + sec_y_shift,
label = paste0( "Yield: ", mypercent(total_item_pct_yield) )
),
label.padding = unit(0.2, "lines"),
hjust=1,vjust=.5,
fill='lightskyblue',
label.size=.1
) +
facet_wrap(~LOTNO)
press_joined_item %>%
dplyr::filter(classify == "below50") %>%
# kiln temp, setpoint, pressure
ggplot(aes(x=time))+
geom_line(aes(y=avg_kiln_temp))+
geom_line(aes(y=setpoint),color='grey50',linetype='dashed')+
geom_point(aes(y=pressure * sec_y_scale + sec_y_shift),alpha=.1,size=.1)+
# color points
geom_point(aes(y=press_greater_03 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='red')+
geom_point(aes(y=press_betw_02_03 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='green3')+
geom_point(aes(y=press_betw_01_02 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='green')+
geom_point(aes(y=press_betw_00_01 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='yellow2')+
geom_point(aes(y=press_less_00 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='red')+
geom_ribbon(aes(ymin = auc_min, ymax = auc_max),fill='pink',alpha=1)+
scale_y_continuous(name = "Kiln temp (F)",
breaks = sort(c(seq(0,3000,1000),400,600,1200)),
sec.axis = sec_axis(~. / sec_y_scale - (sec_y_shift / sec_y_scale),
name = "Pressure",
breaks = seq(-.1,.1,.01)
)
)+
scale_x_continuous(name = "Hours",
labels = number_format(scale=1/60),
breaks = sort(c(seq(0,100*60,12*60)))
)+
# yields
geom_label(data = press_joined_item %>% dplyr::filter(LOTNO %in% sample) %>%
distinct(total_item_pct_yield),
aes(x = 60 * 65, y = -0.04 * sec_y_scale + sec_y_shift,
label = paste0( "Yield: ", mypercent(total_item_pct_yield) )
),
label.padding = unit(0.2, "lines"),
hjust=1,vjust=.5,
fill='lightskyblue',
label.size=.1
) +
facet_wrap(~LOTNO)
item <- count(item_yields$DESCRIPTION) %>%
arrange(-freq) %>%
slice(9) %>%
dplyr::select(x) %>%
unlist()
item_df <- item_yields %>%
dplyr::filter(DESCRIPTION == item) %>%
mutate(Yields = case_when( total_item_pct_yield >= 0.95 ~ "over95",
total_item_pct_yield >= 0.5 ~ "over50",
TRUE ~ "below50"))
item_df %>%
ggplot(aes(x=FIRE_DATE,y=total_item_pct_yield))+
geom_point(aes(fill=Yields),size=3, shape=21, alpha=.8)+
geom_smooth(method=lm, se=FALSE)+
geom_line(alpha=.2)+
# theme(legend.position = "none")+
scale_y_continuous(limits = c(0,1), labels = percent_format())+
scale_fill_brewer(type="qual",palette=2)+
ggtitle(paste0(item))+
labs(x="Fire date", y="Item yield")
df <- item_df %>%
dplyr::select(total_item_pct_yield:aucDiff)
# normalize all predictors
df_norm <- recipe(total_item_pct_yield ~ ., data = df) %>%
step_normalize(all_predictors()) %>%
prep() %>%
juice()
# split predictors and responses
y = as.vector(df_norm$total_item_pct_yield)
x = as.matrix(df_norm[-19])
set.seed(344)
# find best lambda value
cv.glmmod <- cv.glmnet(x,y,alpha=1)
# plot(cv.glmmod)
best.lambda <- cv.glmmod$lambda.min
set.seed(344)
# produce model
glmmod <- glmnet(x, y, alpha=1, lambda = best.lambda)
# store coefficient values
coefs = data.frame(coef(glmmod)[,1])
# plot variable importance
tibble(data.frame(cbind(rownames(coefs), coefs))) %>%
set_colnames(c("Variable", "Importance")) %>%
mutate(Sign = if_else(Importance < 0, "Neg", "Pos"),
Importance = abs(Importance),
Variable = fct_reorder(Variable, Importance),
HJ = if_else(Importance > .5, 1,0)) %>%
arrange(-Importance) %>%
ggplot(aes(x=Importance,y=Variable,fill=Sign))+
geom_col()+
geom_text(aes(label=round(Importance,5), hjust=HJ))+
ggtitle(paste0(item, ", VI by LASSO"))
item_df %>%
dplyr::select(total_item_pct_yield:Yields) %>%
pivot_longer(cols = precip:aucDiff, names_to = "measures", values_to="values") %>%
ggplot(aes(x=values,y=total_item_pct_yield))+
geom_point(aes(fill=Yields),size=2,shape=21,alpha=.8)+
scale_y_continuous(limits = c(0,1), labels = percent_format())+
facet_wrap(~measures,scales='free_x')+
geom_smooth(method=lm, se=FALSE)
set.seed(5532)
a <- item_yields %>%
dplyr::filter(DESCRIPTION == item) %>%
dplyr::filter(total_item_pct_yield <= .5) %>%
select(LOTNO) %>% unlist() %>% sample(6) %>% as.vector()
set.seed(5532)
b <- item_yields %>%
dplyr::filter(DESCRIPTION == item) %>%
dplyr::filter(total_item_pct_yield >= .95) %>%
select(LOTNO) %>% unlist() %>% sample(4) %>% as.vector()
set.seed(5532)
c <- item_yields %>%
dplyr::filter(DESCRIPTION == item) %>%
dplyr::filter((total_item_pct_yield >.5) & (total_item_pct_yield < .95)) %>%
select(LOTNO) %>% unlist() %>% sample(6) %>% as.vector()
sample <- c(a,b,c)
# sample <- item_yields %>%
# dplyr::filter(DESCRIPTION == item) %>%
# dplyr::select(LOTNO) %>% unlist() %>%
# sample(16)
sec_y_scale=20000
sec_y_shift=1500
press_joined_item <- press_joined %>%
dplyr::filter(LOTNO %in% sample) %>%
left_join(
item_yields %>%
dplyr::filter(DESCRIPTION == item) %>%
dplyr::select(LOTNO,total_item_pct_yield)
) %>%
mutate(classify = factor(case_when( total_item_pct_yield >= 0.95 ~ "over95",
total_item_pct_yield >= 0.5 ~ "over50",
TRUE ~ "below50")))
press_joined_item %>%
# kiln temp, setpoint, pressure
ggplot(aes(x=time))+
geom_line(aes(y=avg_kiln_temp))+
geom_line(aes(y=setpoint),color='grey50',linetype='dashed')+
geom_point(aes(y=pressure * sec_y_scale + sec_y_shift),alpha=.1,size=.1)+
# color points
geom_point(aes(y=press_greater_03 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='red')+
geom_point(aes(y=press_betw_02_03 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='green3')+
geom_point(aes(y=press_betw_01_02 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='green')+
geom_point(aes(y=press_betw_00_01 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='yellow2')+
geom_point(aes(y=press_less_00 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='red')+
geom_ribbon(aes(ymin = auc_min, ymax = auc_max),fill='pink',alpha=1)+
scale_y_continuous(name = "Kiln temp (F)",
breaks = sort(c(seq(0,3000,1000),400,600,1200)),
sec.axis = sec_axis(~. / sec_y_scale - (sec_y_shift / sec_y_scale),
name = "Pressure",
breaks = seq(-.1,.1,.01)
)
)+
scale_x_continuous(name = "Hours",
labels = number_format(scale=1/60),
breaks = sort(c(seq(0,100*60,12*60)))
)+
# yields
geom_label(data = press_joined_item %>% dplyr::filter(LOTNO %in% sample) %>%
distinct(total_item_pct_yield),
aes(x = 60 * 65, y = -0.04 * sec_y_scale + sec_y_shift,
label = paste0( "Yield: ", mypercent(total_item_pct_yield) )
),
label.padding = unit(0.2, "lines"),
hjust=1,vjust=.5,
fill='lightskyblue',
label.size=.1
) +
facet_wrap(~LOTNO)
press_joined_item %>%
dplyr::filter(classify == "over95") %>%
# kiln temp, setpoint, pressure
ggplot(aes(x=time))+
geom_line(aes(y=avg_kiln_temp))+
geom_line(aes(y=setpoint),color='grey50',linetype='dashed')+
geom_point(aes(y=pressure * sec_y_scale + sec_y_shift),alpha=.1,size=.1)+
# color points
geom_point(aes(y=press_greater_03 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='red')+
geom_point(aes(y=press_betw_02_03 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='green3')+
geom_point(aes(y=press_betw_01_02 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='green')+
geom_point(aes(y=press_betw_00_01 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='yellow2')+
geom_point(aes(y=press_less_00 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='red')+
geom_ribbon(aes(ymin = auc_min, ymax = auc_max),fill='pink',alpha=1)+
scale_y_continuous(name = "Kiln temp (F)",
breaks = sort(c(seq(0,3000,1000),400,600,1200)),
sec.axis = sec_axis(~. / sec_y_scale - (sec_y_shift / sec_y_scale),
name = "Pressure",
breaks = seq(-.1,.1,.01)
)
)+
scale_x_continuous(name = "Hours",
labels = number_format(scale=1/60),
breaks = sort(c(seq(0,100*60,12*60)))
)+
# yields
geom_label(data = press_joined_item %>% dplyr::filter(LOTNO %in% sample) %>%
distinct(total_item_pct_yield),
aes(x = 60 * 65, y = -0.04 * sec_y_scale + sec_y_shift,
label = paste0( "Yield: ", mypercent(total_item_pct_yield) )
),
label.padding = unit(0.2, "lines"),
hjust=1,vjust=.5,
fill='lightskyblue',
label.size=.1
) +
facet_wrap(~LOTNO)
press_joined_item %>%
dplyr::filter(classify == "over50") %>%
# kiln temp, setpoint, pressure
ggplot(aes(x=time))+
geom_line(aes(y=avg_kiln_temp))+
geom_line(aes(y=setpoint),color='grey50',linetype='dashed')+
geom_point(aes(y=pressure * sec_y_scale + sec_y_shift),alpha=.1,size=.1)+
# color points
geom_point(aes(y=press_greater_03 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='red')+
geom_point(aes(y=press_betw_02_03 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='green3')+
geom_point(aes(y=press_betw_01_02 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='green')+
geom_point(aes(y=press_betw_00_01 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='yellow2')+
geom_point(aes(y=press_less_00 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='red')+
geom_ribbon(aes(ymin = auc_min, ymax = auc_max),fill='pink',alpha=1)+
scale_y_continuous(name = "Kiln temp (F)",
breaks = sort(c(seq(0,3000,1000),400,600,1200)),
sec.axis = sec_axis(~. / sec_y_scale - (sec_y_shift / sec_y_scale),
name = "Pressure",
breaks = seq(-.1,.1,.01)
)
)+
scale_x_continuous(name = "Hours",
labels = number_format(scale=1/60),
breaks = sort(c(seq(0,100*60,12*60)))
)+
# yields
geom_label(data = press_joined_item %>% dplyr::filter(LOTNO %in% sample) %>%
distinct(total_item_pct_yield),
aes(x = 60 * 65, y = -0.04 * sec_y_scale + sec_y_shift,
label = paste0( "Yield: ", mypercent(total_item_pct_yield) )
),
label.padding = unit(0.2, "lines"),
hjust=1,vjust=.5,
fill='lightskyblue',
label.size=.1
) +
facet_wrap(~LOTNO)
press_joined_item %>%
dplyr::filter(classify == "below50") %>%
# kiln temp, setpoint, pressure
ggplot(aes(x=time))+
geom_line(aes(y=avg_kiln_temp))+
geom_line(aes(y=setpoint),color='grey50',linetype='dashed')+
geom_point(aes(y=pressure * sec_y_scale + sec_y_shift),alpha=.1,size=.1)+
# color points
geom_point(aes(y=press_greater_03 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='red')+
geom_point(aes(y=press_betw_02_03 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='green3')+
geom_point(aes(y=press_betw_01_02 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='green')+
geom_point(aes(y=press_betw_00_01 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='yellow2')+
geom_point(aes(y=press_less_00 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='red')+
geom_ribbon(aes(ymin = auc_min, ymax = auc_max),fill='pink',alpha=1)+
scale_y_continuous(name = "Kiln temp (F)",
breaks = sort(c(seq(0,3000,1000),400,600,1200)),
sec.axis = sec_axis(~. / sec_y_scale - (sec_y_shift / sec_y_scale),
name = "Pressure",
breaks = seq(-.1,.1,.01)
)
)+
scale_x_continuous(name = "Hours",
labels = number_format(scale=1/60),
breaks = sort(c(seq(0,100*60,12*60)))
)+
# yields
geom_label(data = press_joined_item %>% dplyr::filter(LOTNO %in% sample) %>%
distinct(total_item_pct_yield),
aes(x = 60 * 65, y = -0.04 * sec_y_scale + sec_y_shift,
label = paste0( "Yield: ", mypercent(total_item_pct_yield) )
),
label.padding = unit(0.2, "lines"),
hjust=1,vjust=.5,
fill='lightskyblue',
label.size=.1
) +
facet_wrap(~LOTNO)
item <- count(item_yields$DESCRIPTION) %>%
arrange(-freq) %>%
slice(12) %>%
dplyr::select(x) %>%
unlist()
item_df <- item_yields %>%
dplyr::filter(DESCRIPTION == item) %>%
mutate(Yields = case_when( total_item_pct_yield >= 0.95 ~ "over95",
total_item_pct_yield >= 0.5 ~ "over50",
TRUE ~ "below50"))
item_df %>%
ggplot(aes(x=FIRE_DATE,y=total_item_pct_yield))+
geom_point(aes(fill=Yields),size=3, shape=21, alpha=.8)+
geom_smooth(method=lm, se=FALSE)+
geom_line(alpha=.2)+
# theme(legend.position = "none")+
scale_y_continuous(limits = c(0,1), labels = percent_format())+
scale_fill_brewer(type="qual",palette=2)+
ggtitle(paste0(item))+
labs(x="Fire date", y="Item yield")
df <- item_df %>%
dplyr::select(total_item_pct_yield:aucDiff)
# normalize all predictors
df_norm <- recipe(total_item_pct_yield ~ ., data = df) %>%
step_normalize(all_predictors()) %>%
prep() %>%
juice()
# split predictors and responses
y = as.vector(df_norm$total_item_pct_yield)
x = as.matrix(df_norm[-19])
set.seed(344)
# find best lambda value
cv.glmmod <- cv.glmnet(x,y,alpha=1)
# plot(cv.glmmod)
best.lambda <- cv.glmmod$lambda.min
set.seed(344)
# produce model
glmmod <- glmnet(x, y, alpha=1, lambda = best.lambda)
# store coefficient values
coefs = data.frame(coef(glmmod)[,1])
# plot variable importance
tibble(data.frame(cbind(rownames(coefs), coefs))) %>%
set_colnames(c("Variable", "Importance")) %>%
mutate(Sign = if_else(Importance < 0, "Neg", "Pos"),
Importance = abs(Importance),
Variable = fct_reorder(Variable, Importance),
HJ = if_else(Importance > .5, 1,0)) %>%
arrange(-Importance) %>%
ggplot(aes(x=Importance,y=Variable,fill=Sign))+
geom_col()+
geom_text(aes(label=round(Importance,5), hjust=HJ))+
ggtitle(paste0(item, ", VI by LASSO"))
item_df %>%
dplyr::select(total_item_pct_yield:Yields) %>%
pivot_longer(cols = precip:aucDiff, names_to = "measures", values_to="values") %>%
ggplot(aes(x=values,y=total_item_pct_yield))+
geom_point(aes(fill=Yields),size=2,shape=21,alpha=.8)+
scale_y_continuous(limits = c(0,1), labels = percent_format())+
facet_wrap(~measures,scales='free_x')+
geom_smooth(method=lm, se=FALSE)
set.seed(5532)
a <- item_yields %>%
dplyr::filter(DESCRIPTION == item) %>%
dplyr::filter(total_item_pct_yield <= .5) %>%
select(LOTNO) %>% unlist() %>% sample(5) %>% as.vector()
set.seed(5532)
b <- item_yields %>%
dplyr::filter(DESCRIPTION == item) %>%
dplyr::filter(total_item_pct_yield >= .95) %>%
select(LOTNO) %>% unlist() %>% sample(6) %>% as.vector()
set.seed(5532)
c <- item_yields %>%
dplyr::filter(DESCRIPTION == item) %>%
dplyr::filter((total_item_pct_yield >.5) & (total_item_pct_yield < .95)) %>%
select(LOTNO) %>% unlist() %>% sample(5) %>% as.vector()
sample <- c(a,b,c)
# sample <- item_yields %>%
# dplyr::filter(DESCRIPTION == item) %>%
# dplyr::select(LOTNO) %>% unlist() %>%
# sample(16)
sec_y_scale=20000
sec_y_shift=1500
press_joined_item <- press_joined %>%
dplyr::filter(LOTNO %in% sample) %>%
left_join(
item_yields %>%
dplyr::filter(DESCRIPTION == item) %>%
dplyr::select(LOTNO,total_item_pct_yield)
) %>%
mutate(classify = factor(case_when( total_item_pct_yield >= 0.95 ~ "over95",
total_item_pct_yield >= 0.5 ~ "over50",
TRUE ~ "below50")))
press_joined_item %>%
# kiln temp, setpoint, pressure
ggplot(aes(x=time))+
geom_line(aes(y=avg_kiln_temp))+
geom_line(aes(y=setpoint),color='grey50',linetype='dashed')+
geom_point(aes(y=pressure * sec_y_scale + sec_y_shift),alpha=.1,size=.1)+
# color points
geom_point(aes(y=press_greater_03 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='red')+
geom_point(aes(y=press_betw_02_03 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='green3')+
geom_point(aes(y=press_betw_01_02 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='green')+
geom_point(aes(y=press_betw_00_01 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='yellow2')+
geom_point(aes(y=press_less_00 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='red')+
geom_ribbon(aes(ymin = auc_min, ymax = auc_max),fill='pink',alpha=1)+
scale_y_continuous(name = "Kiln temp (F)",
breaks = sort(c(seq(0,3000,1000),400,600,1200)),
sec.axis = sec_axis(~. / sec_y_scale - (sec_y_shift / sec_y_scale),
name = "Pressure",
breaks = seq(-.1,.1,.01)
)
)+
scale_x_continuous(name = "Hours",
labels = number_format(scale=1/60),
breaks = sort(c(seq(0,100*60,12*60)))
)+
# yields
geom_label(data = press_joined_item %>% dplyr::filter(LOTNO %in% sample) %>%
distinct(total_item_pct_yield),
aes(x = 60 * 65, y = -0.04 * sec_y_scale + sec_y_shift,
label = paste0( "Yield: ", mypercent(total_item_pct_yield) )
),
label.padding = unit(0.2, "lines"),
hjust=1,vjust=.5,
fill='lightskyblue',
label.size=.1
) +
facet_wrap(~LOTNO)
press_joined_item %>%
dplyr::filter(classify == "over95") %>%
# kiln temp, setpoint, pressure
ggplot(aes(x=time))+
geom_line(aes(y=avg_kiln_temp))+
geom_line(aes(y=setpoint),color='grey50',linetype='dashed')+
geom_point(aes(y=pressure * sec_y_scale + sec_y_shift),alpha=.1,size=.1)+
# color points
geom_point(aes(y=press_greater_03 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='red')+
geom_point(aes(y=press_betw_02_03 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='green3')+
geom_point(aes(y=press_betw_01_02 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='green')+
geom_point(aes(y=press_betw_00_01 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='yellow2')+
geom_point(aes(y=press_less_00 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='red')+
geom_ribbon(aes(ymin = auc_min, ymax = auc_max),fill='pink',alpha=1)+
scale_y_continuous(name = "Kiln temp (F)",
breaks = sort(c(seq(0,3000,1000),400,600,1200)),
sec.axis = sec_axis(~. / sec_y_scale - (sec_y_shift / sec_y_scale),
name = "Pressure",
breaks = seq(-.1,.1,.01)
)
)+
scale_x_continuous(name = "Hours",
labels = number_format(scale=1/60),
breaks = sort(c(seq(0,100*60,12*60)))
)+
# yields
geom_label(data = press_joined_item %>% dplyr::filter(LOTNO %in% sample) %>%
distinct(total_item_pct_yield),
aes(x = 60 * 65, y = -0.04 * sec_y_scale + sec_y_shift,
label = paste0( "Yield: ", mypercent(total_item_pct_yield) )
),
label.padding = unit(0.2, "lines"),
hjust=1,vjust=.5,
fill='lightskyblue',
label.size=.1
) +
facet_wrap(~LOTNO)
press_joined_item %>%
dplyr::filter(classify == "over50") %>%
# kiln temp, setpoint, pressure
ggplot(aes(x=time))+
geom_line(aes(y=avg_kiln_temp))+
geom_line(aes(y=setpoint),color='grey50',linetype='dashed')+
geom_point(aes(y=pressure * sec_y_scale + sec_y_shift),alpha=.1,size=.1)+
# color points
geom_point(aes(y=press_greater_03 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='red')+
geom_point(aes(y=press_betw_02_03 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='green3')+
geom_point(aes(y=press_betw_01_02 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='green')+
geom_point(aes(y=press_betw_00_01 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='yellow2')+
geom_point(aes(y=press_less_00 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='red')+
geom_ribbon(aes(ymin = auc_min, ymax = auc_max),fill='pink',alpha=1)+
scale_y_continuous(name = "Kiln temp (F)",
breaks = sort(c(seq(0,3000,1000),400,600,1200)),
sec.axis = sec_axis(~. / sec_y_scale - (sec_y_shift / sec_y_scale),
name = "Pressure",
breaks = seq(-.1,.1,.01)
)
)+
scale_x_continuous(name = "Hours",
labels = number_format(scale=1/60),
breaks = sort(c(seq(0,100*60,12*60)))
)+
# yields
geom_label(data = press_joined_item %>% dplyr::filter(LOTNO %in% sample) %>%
distinct(total_item_pct_yield),
aes(x = 60 * 65, y = -0.04 * sec_y_scale + sec_y_shift,
label = paste0( "Yield: ", mypercent(total_item_pct_yield) )
),
label.padding = unit(0.2, "lines"),
hjust=1,vjust=.5,
fill='lightskyblue',
label.size=.1
) +
facet_wrap(~LOTNO)
press_joined_item %>%
dplyr::filter(classify == "below50") %>%
# kiln temp, setpoint, pressure
ggplot(aes(x=time))+
geom_line(aes(y=avg_kiln_temp))+
geom_line(aes(y=setpoint),color='grey50',linetype='dashed')+
geom_point(aes(y=pressure * sec_y_scale + sec_y_shift),alpha=.1,size=.1)+
# color points
geom_point(aes(y=press_greater_03 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='red')+
geom_point(aes(y=press_betw_02_03 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='green3')+
geom_point(aes(y=press_betw_01_02 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='green')+
geom_point(aes(y=press_betw_00_01 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='yellow2')+
geom_point(aes(y=press_less_00 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='red')+
geom_ribbon(aes(ymin = auc_min, ymax = auc_max),fill='pink',alpha=1)+
scale_y_continuous(name = "Kiln temp (F)",
breaks = sort(c(seq(0,3000,1000),400,600,1200)),
sec.axis = sec_axis(~. / sec_y_scale - (sec_y_shift / sec_y_scale),
name = "Pressure",
breaks = seq(-.1,.1,.01)
)
)+
scale_x_continuous(name = "Hours",
labels = number_format(scale=1/60),
breaks = sort(c(seq(0,100*60,12*60)))
)+
# yields
geom_label(data = press_joined_item %>% dplyr::filter(LOTNO %in% sample) %>%
distinct(total_item_pct_yield),
aes(x = 60 * 65, y = -0.04 * sec_y_scale + sec_y_shift,
label = paste0( "Yield: ", mypercent(total_item_pct_yield) )
),
label.padding = unit(0.2, "lines"),
hjust=1,vjust=.5,
fill='lightskyblue',
label.size=.1
) +
facet_wrap(~LOTNO)
item <- count(item_yields$DESCRIPTION) %>%
arrange(-freq) %>%
slice(17) %>%
dplyr::select(x) %>%
unlist()
item_df <- item_yields %>%
dplyr::filter(DESCRIPTION == item) %>%
mutate(Yields = case_when( total_item_pct_yield >= 0.95 ~ "over95",
total_item_pct_yield >= 0.5 ~ "over50",
TRUE ~ "below50"))
item_df %>%
ggplot(aes(x=FIRE_DATE,y=total_item_pct_yield))+
geom_point(aes(fill=Yields),size=3, shape=21, alpha=.8)+
geom_smooth(method=lm, se=FALSE)+
geom_line(alpha=.2)+
# theme(legend.position = "none")+
scale_y_continuous(limits = c(0,1), labels = percent_format())+
scale_fill_brewer(type="qual",palette=2)+
ggtitle(paste0(item))+
labs(x="Fire date", y="Item yield")
df <- item_df %>%
dplyr::select(total_item_pct_yield:aucDiff)
# normalize all predictors
df_norm <- recipe(total_item_pct_yield ~ ., data = df) %>%
step_normalize(all_predictors()) %>%
prep() %>%
juice()
# split predictors and responses
y = as.vector(df_norm$total_item_pct_yield)
x = as.matrix(df_norm[-19])
set.seed(344)
# find best lambda value
cv.glmmod <- cv.glmnet(x,y,alpha=1)
# plot(cv.glmmod)
best.lambda <- cv.glmmod$lambda.min
set.seed(344)
# produce model
glmmod <- glmnet(x, y, alpha=1, lambda = best.lambda)
# store coefficient values
coefs = data.frame(coef(glmmod)[,1])
# plot variable importance
tibble(data.frame(cbind(rownames(coefs), coefs))) %>%
set_colnames(c("Variable", "Importance")) %>%
mutate(Sign = if_else(Importance < 0, "Neg", "Pos"),
Importance = abs(Importance),
Variable = fct_reorder(Variable, Importance),
HJ = if_else(Importance > .5, 1,0)) %>%
arrange(-Importance) %>%
ggplot(aes(x=Importance,y=Variable,fill=Sign))+
geom_col()+
geom_text(aes(label=round(Importance,5), hjust=HJ))+
ggtitle(paste0(item, ", VI by LASSO"))
item_df %>%
dplyr::select(total_item_pct_yield:Yields) %>%
pivot_longer(cols = precip:aucDiff, names_to = "measures", values_to="values") %>%
ggplot(aes(x=values,y=total_item_pct_yield))+
geom_point(aes(fill=Yields),size=2,shape=21,alpha=.8)+
scale_y_continuous(limits = c(0,1), labels = percent_format())+
facet_wrap(~measures,scales='free_x')+
geom_smooth(method=lm, se=FALSE)
set.seed(5532)
a <- item_yields %>%
dplyr::filter(DESCRIPTION == item) %>%
dplyr::filter(total_item_pct_yield <= .5) %>%
select(LOTNO) %>% unlist() %>% sample(5) %>% as.vector()
set.seed(5532)
b <- item_yields %>%
dplyr::filter(DESCRIPTION == item) %>%
dplyr::filter(total_item_pct_yield >= .95) %>%
select(LOTNO) %>% unlist() %>% sample(6) %>% as.vector()
set.seed(5532)
c <- item_yields %>%
dplyr::filter(DESCRIPTION == item) %>%
dplyr::filter((total_item_pct_yield >.5) & (total_item_pct_yield < .95)) %>%
select(LOTNO) %>% unlist() %>% sample(5) %>% as.vector()
sample <- c(a,b,c)
# sample <- item_yields %>%
# dplyr::filter(DESCRIPTION == item) %>%
# dplyr::select(LOTNO) %>% unlist() %>%
# sample(16)
sec_y_scale=20000
sec_y_shift=1500
press_joined_item <- press_joined %>%
dplyr::filter(LOTNO %in% sample) %>%
left_join(
item_yields %>%
dplyr::filter(DESCRIPTION == item) %>%
dplyr::select(LOTNO,total_item_pct_yield)
) %>%
mutate(classify = factor(case_when( total_item_pct_yield >= 0.95 ~ "over95",
total_item_pct_yield >= 0.5 ~ "over50",
TRUE ~ "below50")))
press_joined_item %>%
# kiln temp, setpoint, pressure
ggplot(aes(x=time))+
geom_line(aes(y=avg_kiln_temp))+
geom_line(aes(y=setpoint),color='grey50',linetype='dashed')+
geom_point(aes(y=pressure * sec_y_scale + sec_y_shift),alpha=.1,size=.1)+
# color points
geom_point(aes(y=press_greater_03 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='red')+
geom_point(aes(y=press_betw_02_03 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='green3')+
geom_point(aes(y=press_betw_01_02 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='green')+
geom_point(aes(y=press_betw_00_01 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='yellow2')+
geom_point(aes(y=press_less_00 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='red')+
geom_ribbon(aes(ymin = auc_min, ymax = auc_max),fill='pink',alpha=1)+
scale_y_continuous(name = "Kiln temp (F)",
breaks = sort(c(seq(0,3000,1000),400,600,1200)),
sec.axis = sec_axis(~. / sec_y_scale - (sec_y_shift / sec_y_scale),
name = "Pressure",
breaks = seq(-.1,.1,.01)
)
)+
scale_x_continuous(name = "Hours",
labels = number_format(scale=1/60),
breaks = sort(c(seq(0,100*60,12*60)))
)+
# yields
geom_label(data = press_joined_item %>% dplyr::filter(LOTNO %in% sample) %>%
distinct(total_item_pct_yield),
aes(x = 60 * 65, y = -0.04 * sec_y_scale + sec_y_shift,
label = paste0( "Yield: ", mypercent(total_item_pct_yield) )
),
label.padding = unit(0.2, "lines"),
hjust=1,vjust=.5,
fill='lightskyblue',
label.size=.1
) +
facet_wrap(~LOTNO)
press_joined_item %>%
dplyr::filter(classify == "over95") %>%
# kiln temp, setpoint, pressure
ggplot(aes(x=time))+
geom_line(aes(y=avg_kiln_temp))+
geom_line(aes(y=setpoint),color='grey50',linetype='dashed')+
geom_point(aes(y=pressure * sec_y_scale + sec_y_shift),alpha=.1,size=.1)+
# color points
geom_point(aes(y=press_greater_03 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='red')+
geom_point(aes(y=press_betw_02_03 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='green3')+
geom_point(aes(y=press_betw_01_02 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='green')+
geom_point(aes(y=press_betw_00_01 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='yellow2')+
geom_point(aes(y=press_less_00 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='red')+
geom_ribbon(aes(ymin = auc_min, ymax = auc_max),fill='pink',alpha=1)+
scale_y_continuous(name = "Kiln temp (F)",
breaks = sort(c(seq(0,3000,1000),400,600,1200)),
sec.axis = sec_axis(~. / sec_y_scale - (sec_y_shift / sec_y_scale),
name = "Pressure",
breaks = seq(-.1,.1,.01)
)
)+
scale_x_continuous(name = "Hours",
labels = number_format(scale=1/60),
breaks = sort(c(seq(0,100*60,12*60)))
)+
# yields
geom_label(data = press_joined_item %>% dplyr::filter(LOTNO %in% sample) %>%
distinct(total_item_pct_yield),
aes(x = 60 * 65, y = -0.04 * sec_y_scale + sec_y_shift,
label = paste0( "Yield: ", mypercent(total_item_pct_yield) )
),
label.padding = unit(0.2, "lines"),
hjust=1,vjust=.5,
fill='lightskyblue',
label.size=.1
) +
facet_wrap(~LOTNO)
press_joined_item %>%
dplyr::filter(classify == "over50") %>%
# kiln temp, setpoint, pressure
ggplot(aes(x=time))+
geom_line(aes(y=avg_kiln_temp))+
geom_line(aes(y=setpoint),color='grey50',linetype='dashed')+
geom_point(aes(y=pressure * sec_y_scale + sec_y_shift),alpha=.1,size=.1)+
# color points
geom_point(aes(y=press_greater_03 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='red')+
geom_point(aes(y=press_betw_02_03 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='green3')+
geom_point(aes(y=press_betw_01_02 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='green')+
geom_point(aes(y=press_betw_00_01 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='yellow2')+
geom_point(aes(y=press_less_00 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='red')+
geom_ribbon(aes(ymin = auc_min, ymax = auc_max),fill='pink',alpha=1)+
scale_y_continuous(name = "Kiln temp (F)",
breaks = sort(c(seq(0,3000,1000),400,600,1200)),
sec.axis = sec_axis(~. / sec_y_scale - (sec_y_shift / sec_y_scale),
name = "Pressure",
breaks = seq(-.1,.1,.01)
)
)+
scale_x_continuous(name = "Hours",
labels = number_format(scale=1/60),
breaks = sort(c(seq(0,100*60,12*60)))
)+
# yields
geom_label(data = press_joined_item %>% dplyr::filter(LOTNO %in% sample) %>%
distinct(total_item_pct_yield),
aes(x = 60 * 65, y = -0.04 * sec_y_scale + sec_y_shift,
label = paste0( "Yield: ", mypercent(total_item_pct_yield) )
),
label.padding = unit(0.2, "lines"),
hjust=1,vjust=.5,
fill='lightskyblue',
label.size=.1
) +
facet_wrap(~LOTNO)
press_joined_item %>%
dplyr::filter(classify == "below50") %>%
# kiln temp, setpoint, pressure
ggplot(aes(x=time))+
geom_line(aes(y=avg_kiln_temp))+
geom_line(aes(y=setpoint),color='grey50',linetype='dashed')+
geom_point(aes(y=pressure * sec_y_scale + sec_y_shift),alpha=.1,size=.1)+
# color points
geom_point(aes(y=press_greater_03 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='red')+
geom_point(aes(y=press_betw_02_03 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='green3')+
geom_point(aes(y=press_betw_01_02 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='green')+
geom_point(aes(y=press_betw_00_01 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='yellow2')+
geom_point(aes(y=press_less_00 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='red')+
geom_ribbon(aes(ymin = auc_min, ymax = auc_max),fill='pink',alpha=1)+
scale_y_continuous(name = "Kiln temp (F)",
breaks = sort(c(seq(0,3000,1000),400,600,1200)),
sec.axis = sec_axis(~. / sec_y_scale - (sec_y_shift / sec_y_scale),
name = "Pressure",
breaks = seq(-.1,.1,.01)
)
)+
scale_x_continuous(name = "Hours",
labels = number_format(scale=1/60),
breaks = sort(c(seq(0,100*60,12*60)))
)+
# yields
geom_label(data = press_joined_item %>% dplyr::filter(LOTNO %in% sample) %>%
distinct(total_item_pct_yield),
aes(x = 60 * 65, y = -0.04 * sec_y_scale + sec_y_shift,
label = paste0( "Yield: ", mypercent(total_item_pct_yield) )
),
label.padding = unit(0.2, "lines"),
hjust=1,vjust=.5,
fill='lightskyblue',
label.size=.1
) +
facet_wrap(~LOTNO)
item <- count(item_yields$DESCRIPTION) %>%
arrange(-freq) %>%
slice(26) %>%
dplyr::select(x) %>%
unlist()
item_df <- item_yields %>%
dplyr::filter(DESCRIPTION == item) %>%
mutate(Yields = case_when( total_item_pct_yield >= 0.95 ~ "over95",
total_item_pct_yield >= 0.5 ~ "over50",
TRUE ~ "below50"))
item_df %>%
ggplot(aes(x=FIRE_DATE,y=total_item_pct_yield))+
geom_point(aes(fill=Yields),size=3, shape=21, alpha=.8)+
geom_smooth(method=lm, se=FALSE)+
geom_line(alpha=.2)+
# theme(legend.position = "none")+
scale_y_continuous(limits = c(0,1), labels = percent_format())+
scale_fill_brewer(type="qual",palette=2)+
ggtitle(paste0(item))+
labs(x="Fire date", y="Item yield")
df <- item_df %>%
dplyr::select(total_item_pct_yield:aucDiff)
# normalize all predictors
df_norm <- recipe(total_item_pct_yield ~ ., data = df) %>%
step_normalize(all_predictors()) %>%
prep() %>%
juice()
# split predictors and responses
y = as.vector(df_norm$total_item_pct_yield)
x = as.matrix(df_norm[-19])
set.seed(344)
# find best lambda value
cv.glmmod <- cv.glmnet(x,y,alpha=1)
# plot(cv.glmmod)
best.lambda <- cv.glmmod$lambda.min
set.seed(344)
# produce model
glmmod <- glmnet(x, y, alpha=1, lambda = best.lambda)
# store coefficient values
coefs = data.frame(coef(glmmod)[,1])
# plot variable importance
tibble(data.frame(cbind(rownames(coefs), coefs))) %>%
set_colnames(c("Variable", "Importance")) %>%
mutate(Sign = if_else(Importance < 0, "Neg", "Pos"),
Importance = abs(Importance),
Variable = fct_reorder(Variable, Importance),
HJ = if_else(Importance > .5, 1,0)) %>%
arrange(-Importance) %>%
ggplot(aes(x=Importance,y=Variable,fill=Sign))+
geom_col()+
geom_text(aes(label=round(Importance,5), hjust=HJ))+
ggtitle(paste0(item, ", VI by LASSO"))
item_df %>%
dplyr::select(total_item_pct_yield:Yields) %>%
pivot_longer(cols = precip:aucDiff, names_to = "measures", values_to="values") %>%
ggplot(aes(x=values,y=total_item_pct_yield))+
geom_point(aes(fill=Yields),size=2,shape=21,alpha=.8)+
scale_y_continuous(limits = c(0,1), labels = percent_format())+
facet_wrap(~measures,scales='free_x')+
geom_smooth(method=lm, se=FALSE)
set.seed(532)
a <- item_yields %>%
dplyr::filter(DESCRIPTION == item) %>%
dplyr::filter(total_item_pct_yield <= .5) %>%
select(LOTNO) %>% unlist() %>% sample(5) %>% as.vector()
set.seed(532)
b <- item_yields %>%
dplyr::filter(DESCRIPTION == item) %>%
dplyr::filter(total_item_pct_yield >= .95) %>%
select(LOTNO) %>% unlist() %>% sample(5) %>% as.vector()
set.seed(552)
c <- item_yields %>%
dplyr::filter(DESCRIPTION == item) %>%
dplyr::filter((total_item_pct_yield >.5) & (total_item_pct_yield < .95)) %>%
select(LOTNO) %>% unlist() %>% sample(6) %>% as.vector()
sample <- c(a,b,c)
# sample <- item_yields %>%
# dplyr::filter(DESCRIPTION == item) %>%
# dplyr::select(LOTNO) %>% unlist() %>%
# sample(16)
sec_y_scale=20000
sec_y_shift=1500
press_joined_item <- press_joined %>%
dplyr::filter(LOTNO %in% sample) %>%
left_join(
item_yields %>%
dplyr::filter(DESCRIPTION == item) %>%
dplyr::select(LOTNO,total_item_pct_yield)
) %>%
mutate(classify = factor(case_when( total_item_pct_yield >= 0.95 ~ "over95",
total_item_pct_yield >= 0.5 ~ "over50",
TRUE ~ "below50")))
press_joined_item %>%
# kiln temp, setpoint, pressure
ggplot(aes(x=time))+
geom_line(aes(y=avg_kiln_temp))+
geom_line(aes(y=setpoint),color='grey50',linetype='dashed')+
geom_point(aes(y=pressure * sec_y_scale + sec_y_shift),alpha=.1,size=.1)+
# color points
geom_point(aes(y=press_greater_03 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='red')+
geom_point(aes(y=press_betw_02_03 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='green3')+
geom_point(aes(y=press_betw_01_02 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='green')+
geom_point(aes(y=press_betw_00_01 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='yellow2')+
geom_point(aes(y=press_less_00 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='red')+
geom_ribbon(aes(ymin = auc_min, ymax = auc_max),fill='pink',alpha=1)+
scale_y_continuous(name = "Kiln temp (F)",
breaks = sort(c(seq(0,3000,1000),400,600,1200)),
sec.axis = sec_axis(~. / sec_y_scale - (sec_y_shift / sec_y_scale),
name = "Pressure",
breaks = seq(-.1,.1,.01)
)
)+
scale_x_continuous(name = "Hours",
labels = number_format(scale=1/60),
breaks = sort(c(seq(0,100*60,12*60)))
)+
# yields
geom_label(data = press_joined_item %>% dplyr::filter(LOTNO %in% sample) %>%
distinct(total_item_pct_yield),
aes(x = 60 * 65, y = -0.04 * sec_y_scale + sec_y_shift,
label = paste0( "Yield: ", mypercent(total_item_pct_yield) )
),
label.padding = unit(0.2, "lines"),
hjust=1,vjust=.5,
fill='lightskyblue',
label.size=.1
) +
facet_wrap(~LOTNO)
press_joined_item %>%
dplyr::filter(classify == "over95") %>%
# kiln temp, setpoint, pressure
ggplot(aes(x=time))+
geom_line(aes(y=avg_kiln_temp))+
geom_line(aes(y=setpoint),color='grey50',linetype='dashed')+
geom_point(aes(y=pressure * sec_y_scale + sec_y_shift),alpha=.1,size=.1)+
# color points
geom_point(aes(y=press_greater_03 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='red')+
geom_point(aes(y=press_betw_02_03 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='green3')+
geom_point(aes(y=press_betw_01_02 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='green')+
geom_point(aes(y=press_betw_00_01 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='yellow2')+
geom_point(aes(y=press_less_00 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='red')+
geom_ribbon(aes(ymin = auc_min, ymax = auc_max),fill='pink',alpha=1)+
scale_y_continuous(name = "Kiln temp (F)",
breaks = sort(c(seq(0,3000,1000),400,600,1200)),
sec.axis = sec_axis(~. / sec_y_scale - (sec_y_shift / sec_y_scale),
name = "Pressure",
breaks = seq(-.1,.1,.01)
)
)+
scale_x_continuous(name = "Hours",
labels = number_format(scale=1/60),
breaks = sort(c(seq(0,100*60,12*60)))
)+
# yields
geom_label(data = press_joined_item %>% dplyr::filter(LOTNO %in% sample) %>%
distinct(total_item_pct_yield),
aes(x = 60 * 65, y = -0.04 * sec_y_scale + sec_y_shift,
label = paste0( "Yield: ", mypercent(total_item_pct_yield) )
),
label.padding = unit(0.2, "lines"),
hjust=1,vjust=.5,
fill='lightskyblue',
label.size=.1
) +
facet_wrap(~LOTNO)
press_joined_item %>%
dplyr::filter(classify == "over50") %>%
# kiln temp, setpoint, pressure
ggplot(aes(x=time))+
geom_line(aes(y=avg_kiln_temp))+
geom_line(aes(y=setpoint),color='grey50',linetype='dashed')+
geom_point(aes(y=pressure * sec_y_scale + sec_y_shift),alpha=.1,size=.1)+
# color points
geom_point(aes(y=press_greater_03 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='red')+
geom_point(aes(y=press_betw_02_03 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='green3')+
geom_point(aes(y=press_betw_01_02 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='green')+
geom_point(aes(y=press_betw_00_01 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='yellow2')+
geom_point(aes(y=press_less_00 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='red')+
geom_ribbon(aes(ymin = auc_min, ymax = auc_max),fill='pink',alpha=1)+
scale_y_continuous(name = "Kiln temp (F)",
breaks = sort(c(seq(0,3000,1000),400,600,1200)),
sec.axis = sec_axis(~. / sec_y_scale - (sec_y_shift / sec_y_scale),
name = "Pressure",
breaks = seq(-.1,.1,.01)
)
)+
scale_x_continuous(name = "Hours",
labels = number_format(scale=1/60),
breaks = sort(c(seq(0,100*60,12*60)))
)+
# yields
geom_label(data = press_joined_item %>% dplyr::filter(LOTNO %in% sample) %>%
distinct(total_item_pct_yield),
aes(x = 60 * 65, y = -0.04 * sec_y_scale + sec_y_shift,
label = paste0( "Yield: ", mypercent(total_item_pct_yield) )
),
label.padding = unit(0.2, "lines"),
hjust=1,vjust=.5,
fill='lightskyblue',
label.size=.1
) +
facet_wrap(~LOTNO)
press_joined_item %>%
dplyr::filter(classify == "below50") %>%
# kiln temp, setpoint, pressure
ggplot(aes(x=time))+
geom_line(aes(y=avg_kiln_temp))+
geom_line(aes(y=setpoint),color='grey50',linetype='dashed')+
geom_point(aes(y=pressure * sec_y_scale + sec_y_shift),alpha=.1,size=.1)+
# color points
geom_point(aes(y=press_greater_03 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='red')+
geom_point(aes(y=press_betw_02_03 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='green3')+
geom_point(aes(y=press_betw_01_02 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='green')+
geom_point(aes(y=press_betw_00_01 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='yellow2')+
geom_point(aes(y=press_less_00 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='red')+
geom_ribbon(aes(ymin = auc_min, ymax = auc_max),fill='pink',alpha=1)+
scale_y_continuous(name = "Kiln temp (F)",
breaks = sort(c(seq(0,3000,1000),400,600,1200)),
sec.axis = sec_axis(~. / sec_y_scale - (sec_y_shift / sec_y_scale),
name = "Pressure",
breaks = seq(-.1,.1,.01)
)
)+
scale_x_continuous(name = "Hours",
labels = number_format(scale=1/60),
breaks = sort(c(seq(0,100*60,12*60)))
)+
# yields
geom_label(data = press_joined_item %>% dplyr::filter(LOTNO %in% sample) %>%
distinct(total_item_pct_yield),
aes(x = 60 * 65, y = -0.04 * sec_y_scale + sec_y_shift,
label = paste0( "Yield: ", mypercent(total_item_pct_yield) )
),
label.padding = unit(0.2, "lines"),
hjust=1,vjust=.5,
fill='lightskyblue',
label.size=.1
) +
facet_wrap(~LOTNO)
item <- count(item_yields$DESCRIPTION) %>%
arrange(-freq) %>%
slice(27) %>%
dplyr::select(x) %>%
unlist()
item_df <- item_yields %>%
dplyr::filter(DESCRIPTION == item) %>%
mutate(Yields = case_when( total_item_pct_yield >= 0.95 ~ "over95",
total_item_pct_yield >= 0.5 ~ "over50",
TRUE ~ "below50"))
item_df %>%
ggplot(aes(x=FIRE_DATE,y=total_item_pct_yield))+
geom_point(aes(fill=Yields),size=3, shape=21, alpha=.8)+
geom_smooth(method=lm, se=FALSE)+
geom_line(alpha=.2)+
# theme(legend.position = "none")+
scale_y_continuous(limits = c(0,1), labels = percent_format())+
scale_fill_brewer(type="qual",palette=2)+
ggtitle(paste0(item))+
labs(x="Fire date", y="Item yield")
df <- item_df %>%
dplyr::select(total_item_pct_yield:aucDiff)
# normalize all predictors
df_norm <- recipe(total_item_pct_yield ~ ., data = df) %>%
step_normalize(all_predictors()) %>%
prep() %>%
juice()
# split predictors and responses
y = as.vector(df_norm$total_item_pct_yield)
x = as.matrix(df_norm[-19])
set.seed(344)
# find best lambda value
cv.glmmod <- cv.glmnet(x,y,alpha=1)
# plot(cv.glmmod)
best.lambda <- cv.glmmod$lambda.min
set.seed(344)
# produce model
glmmod <- glmnet(x, y, alpha=1, lambda = best.lambda)
# store coefficient values
coefs = data.frame(coef(glmmod)[,1])
# plot variable importance
tibble(data.frame(cbind(rownames(coefs), coefs))) %>%
set_colnames(c("Variable", "Importance")) %>%
mutate(Sign = if_else(Importance < 0, "Neg", "Pos"),
Importance = abs(Importance),
Variable = fct_reorder(Variable, Importance),
HJ = if_else(Importance > .5, 1,0)) %>%
arrange(-Importance) %>%
ggplot(aes(x=Importance,y=Variable,fill=Sign))+
geom_col()+
geom_text(aes(label=round(Importance,5), hjust=HJ))+
ggtitle(paste0(item, ", VI by LASSO"))
item_df %>%
dplyr::select(total_item_pct_yield:Yields) %>%
pivot_longer(cols = precip:aucDiff, names_to = "measures", values_to="values") %>%
ggplot(aes(x=values,y=total_item_pct_yield))+
geom_point(aes(fill=Yields),size=2,shape=21,alpha=.8)+
scale_y_continuous(limits = c(0,1), labels = percent_format())+
facet_wrap(~measures,scales='free_x')+
geom_smooth(method=lm, se=FALSE)
set.seed(532)
a <- item_yields %>%
dplyr::filter(DESCRIPTION == item) %>%
dplyr::filter(total_item_pct_yield <= .5) %>%
select(LOTNO) %>% unlist() %>% sample(3) %>% as.vector()
set.seed(532)
b <- item_yields %>%
dplyr::filter(DESCRIPTION == item) %>%
dplyr::filter(total_item_pct_yield >= .95) %>%
select(LOTNO) %>% unlist() %>% sample(6) %>% as.vector()
set.seed(552)
c <- item_yields %>%
dplyr::filter(DESCRIPTION == item) %>%
dplyr::filter((total_item_pct_yield >.5) & (total_item_pct_yield < .95)) %>%
select(LOTNO) %>% unlist() %>% sample(7) %>% as.vector()
sample <- c(a,b,c)
# sample <- item_yields %>%
# dplyr::filter(DESCRIPTION == item) %>%
# dplyr::select(LOTNO) %>% unlist() %>%
# sample(16)
sec_y_scale=20000
sec_y_shift=1500
press_joined_item <- press_joined %>%
dplyr::filter(LOTNO %in% sample) %>%
left_join(
item_yields %>%
dplyr::filter(DESCRIPTION == item) %>%
dplyr::select(LOTNO,total_item_pct_yield)
) %>%
mutate(classify = factor(case_when( total_item_pct_yield >= 0.95 ~ "over95",
total_item_pct_yield >= 0.5 ~ "over50",
TRUE ~ "below50")))
press_joined_item %>%
# kiln temp, setpoint, pressure
ggplot(aes(x=time))+
geom_line(aes(y=avg_kiln_temp))+
geom_line(aes(y=setpoint),color='grey50',linetype='dashed')+
geom_point(aes(y=pressure * sec_y_scale + sec_y_shift),alpha=.1,size=.1)+
# color points
geom_point(aes(y=press_greater_03 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='red')+
geom_point(aes(y=press_betw_02_03 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='green3')+
geom_point(aes(y=press_betw_01_02 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='green')+
geom_point(aes(y=press_betw_00_01 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='yellow2')+
geom_point(aes(y=press_less_00 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='red')+
geom_ribbon(aes(ymin = auc_min, ymax = auc_max),fill='pink',alpha=1)+
scale_y_continuous(name = "Kiln temp (F)",
breaks = sort(c(seq(0,3000,1000),400,600,1200)),
sec.axis = sec_axis(~. / sec_y_scale - (sec_y_shift / sec_y_scale),
name = "Pressure",
breaks = seq(-.1,.1,.01)
)
)+
scale_x_continuous(name = "Hours",
labels = number_format(scale=1/60),
breaks = sort(c(seq(0,100*60,12*60)))
)+
# yields
geom_label(data = press_joined_item %>% dplyr::filter(LOTNO %in% sample) %>%
distinct(total_item_pct_yield),
aes(x = 60 * 65, y = -0.04 * sec_y_scale + sec_y_shift,
label = paste0( "Yield: ", mypercent(total_item_pct_yield) )
),
label.padding = unit(0.2, "lines"),
hjust=1,vjust=.5,
fill='lightskyblue',
label.size=.1
) +
facet_wrap(~LOTNO)
press_joined_item %>%
dplyr::filter(classify == "over95") %>%
# kiln temp, setpoint, pressure
ggplot(aes(x=time))+
geom_line(aes(y=avg_kiln_temp))+
geom_line(aes(y=setpoint),color='grey50',linetype='dashed')+
geom_point(aes(y=pressure * sec_y_scale + sec_y_shift),alpha=.1,size=.1)+
# color points
geom_point(aes(y=press_greater_03 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='red')+
geom_point(aes(y=press_betw_02_03 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='green3')+
geom_point(aes(y=press_betw_01_02 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='green')+
geom_point(aes(y=press_betw_00_01 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='yellow2')+
geom_point(aes(y=press_less_00 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='red')+
geom_ribbon(aes(ymin = auc_min, ymax = auc_max),fill='pink',alpha=1)+
scale_y_continuous(name = "Kiln temp (F)",
breaks = sort(c(seq(0,3000,1000),400,600,1200)),
sec.axis = sec_axis(~. / sec_y_scale - (sec_y_shift / sec_y_scale),
name = "Pressure",
breaks = seq(-.1,.1,.01)
)
)+
scale_x_continuous(name = "Hours",
labels = number_format(scale=1/60),
breaks = sort(c(seq(0,100*60,12*60)))
)+
# yields
geom_label(data = press_joined_item %>% dplyr::filter(LOTNO %in% sample) %>%
distinct(total_item_pct_yield),
aes(x = 60 * 65, y = -0.04 * sec_y_scale + sec_y_shift,
label = paste0( "Yield: ", mypercent(total_item_pct_yield) )
),
label.padding = unit(0.2, "lines"),
hjust=1,vjust=.5,
fill='lightskyblue',
label.size=.1
) +
facet_wrap(~LOTNO)
press_joined_item %>%
dplyr::filter(classify == "over50") %>%
# kiln temp, setpoint, pressure
ggplot(aes(x=time))+
geom_line(aes(y=avg_kiln_temp))+
geom_line(aes(y=setpoint),color='grey50',linetype='dashed')+
geom_point(aes(y=pressure * sec_y_scale + sec_y_shift),alpha=.1,size=.1)+
# color points
geom_point(aes(y=press_greater_03 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='red')+
geom_point(aes(y=press_betw_02_03 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='green3')+
geom_point(aes(y=press_betw_01_02 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='green')+
geom_point(aes(y=press_betw_00_01 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='yellow2')+
geom_point(aes(y=press_less_00 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='red')+
geom_ribbon(aes(ymin = auc_min, ymax = auc_max),fill='pink',alpha=1)+
scale_y_continuous(name = "Kiln temp (F)",
breaks = sort(c(seq(0,3000,1000),400,600,1200)),
sec.axis = sec_axis(~. / sec_y_scale - (sec_y_shift / sec_y_scale),
name = "Pressure",
breaks = seq(-.1,.1,.01)
)
)+
scale_x_continuous(name = "Hours",
labels = number_format(scale=1/60),
breaks = sort(c(seq(0,100*60,12*60)))
)+
# yields
geom_label(data = press_joined_item %>% dplyr::filter(LOTNO %in% sample) %>%
distinct(total_item_pct_yield),
aes(x = 60 * 65, y = -0.04 * sec_y_scale + sec_y_shift,
label = paste0( "Yield: ", mypercent(total_item_pct_yield) )
),
label.padding = unit(0.2, "lines"),
hjust=1,vjust=.5,
fill='lightskyblue',
label.size=.1
) +
facet_wrap(~LOTNO)
press_joined_item %>%
dplyr::filter(classify == "below50") %>%
# kiln temp, setpoint, pressure
ggplot(aes(x=time))+
geom_line(aes(y=avg_kiln_temp))+
geom_line(aes(y=setpoint),color='grey50',linetype='dashed')+
geom_point(aes(y=pressure * sec_y_scale + sec_y_shift),alpha=.1,size=.1)+
# color points
geom_point(aes(y=press_greater_03 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='red')+
geom_point(aes(y=press_betw_02_03 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='green3')+
geom_point(aes(y=press_betw_01_02 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='green')+
geom_point(aes(y=press_betw_00_01 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='yellow2')+
geom_point(aes(y=press_less_00 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='red')+
geom_ribbon(aes(ymin = auc_min, ymax = auc_max),fill='pink',alpha=1)+
scale_y_continuous(name = "Kiln temp (F)",
breaks = sort(c(seq(0,3000,1000),400,600,1200)),
sec.axis = sec_axis(~. / sec_y_scale - (sec_y_shift / sec_y_scale),
name = "Pressure",
breaks = seq(-.1,.1,.01)
)
)+
scale_x_continuous(name = "Hours",
labels = number_format(scale=1/60),
breaks = sort(c(seq(0,100*60,12*60)))
)+
# yields
geom_label(data = press_joined_item %>% dplyr::filter(LOTNO %in% sample) %>%
distinct(total_item_pct_yield),
aes(x = 60 * 65, y = -0.04 * sec_y_scale + sec_y_shift,
label = paste0( "Yield: ", mypercent(total_item_pct_yield) )
),
label.padding = unit(0.2, "lines"),
hjust=1,vjust=.5,
fill='lightskyblue',
label.size=.1
) +
facet_wrap(~LOTNO)